From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 16 Jul 2021 16:16:43 +0100
Subject: Update GEMM assembly kernels

- Introduce Fp32 kernels with internal calculations in Bfloat16 when
  fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 Android.bp | 45 +-
 SConstruct | 3 +-
 arm_compute/core/CPP/CPPTypes.h | 8 +-
 arm_compute/core/Types.h | 14 +-
 filelist.json | 51 +-
 src/common/cpuinfo/CpuModel.cpp | 33 +-
 src/core/NEON/kernels/arm_gemm/asmlib.hpp | 5 +-
 src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 71 +-
 src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 39 +-
 src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 118 +-
 src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 23 +-
 .../NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 147 +-
 .../kernels/arm_gemm/gemm_hybrid_quantized.hpp | 46 +-
 src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 72 +-
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 94 +-
 src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp | 116 +-
 src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp | 94 +-
 src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 68 +-
 src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 14 +-
 .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 101 +-
 .../a64_interleave8_block4_fp32_bf16.hpp | 169 +
 .../kernels/arm_gemm/indirect-interleaves/list.hpp | 1 +
 .../NEON/kernels/arm_gemm/interleave_indirect.cpp | 10 +-
 .../kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 25 +-
 .../kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp | 26 +-
 .../kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 27 +-
 .../kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp | 36 +-
 .../arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp | 82 -
 .../kernels/a64_gemv_fp32_mla_32/generic.cpp | 1547 --------
 .../kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp | 3 +-
 .../arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp | 4 -
 .../arm_gemm/kernels/a64_hgemm_8x24/generic.cpp | 4 -
 .../kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp | 4 -
 .../kernels/a64_hybrid_bf16fp32_dot_6x16.hpp | 25 +-
 .../a64_hybrid_bf16fp32_dot_6x16/generic.cpp | 10 +-
 .../kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp | 103 +
 .../a64_hybrid_bf16fp32_mmla_6x16/generic.cpp | 3725 ++++++++++++++++++++
 .../arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp | 30 +-
 .../kernels/a64_hybrid_fp16_mla_6x32/a55.cpp | 55 +-
 .../kernels/a64_hybrid_fp16_mla_6x32/generic.cpp | 5 +-
 .../arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp | 117 +
 .../kernels/a64_hybrid_fp32_mla_4x24/a55.cpp | 2807 +++++++++++++++
 .../kernels/a64_hybrid_fp32_mla_4x24/generic.cpp | 2595 ++++++++++++++
 .../arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp | 38 +-
 .../kernels/a64_hybrid_fp32_mla_6x16/a55.cpp | 52 +-
 .../kernels/a64_hybrid_fp32_mla_6x16/generic.cpp | 10 +-
 .../arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp | 8 +-
 .../kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp | 103 +
 .../a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp | 2426 +++++++++++++
 .../kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp | 103 +
 .../a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp | 3137 +++++++++++++++++
 .../arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp | 30 +-
 .../kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp | 18 +-
 .../kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp | 174 +-
 .../arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp | 102 +
 .../kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp | 2104 +++++++++++
 .../arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp | 30 +-
 .../kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp | 12 +-
 .../kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp | 384 +-
 .../arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp | 102 +
 .../kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp | 3640 +++++++++++++++++++
 .../arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp | 41 +-
 .../kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp | 10 +-
 .../kernels/a64_hybrid_s8s32_mmla_6x16.hpp | 113 +
 .../kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp | 3463 ++++++++++++++++++
 .../arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp | 30 +-
 .../kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp | 18 +-
 .../kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp | 174 +-
 .../arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp | 102 +
 .../kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp | 2104 +++++++++++
 .../arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp | 41 +-
 .../kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp | 10 +-
 .../kernels/a64_hybrid_u8u32_mmla_6x16.hpp | 113 +
 .../kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp | 3463 ++++++++++++++++++
 .../kernels/a64_interleaved_bf16fp32_dot_8x12.hpp | 65 +-
 .../a64_interleaved_bf16fp32_dot_8x12/generic.cpp | 508 ++-
 .../a64_interleaved_bf16fp32_dot_8x12/x1.cpp | 330 ++
 .../kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp | 77 +-
 .../a64_interleaved_bf16fp32_mmla_8x12/generic.cpp | 647 ++--
 .../kernels/a64_interleaved_fp16_mla_8x24.hpp | 110 +
 .../kernels/a64_interleaved_fp16_mla_8x24/a55.cpp | 263 ++
 .../a64_interleaved_fp16_mla_8x24/generic.cpp | 247 ++
 .../kernels/a64_interleaved_fp16_mla_8x24/x1.cpp | 247 ++
 .../kernels/a64_interleaved_fp32_mla_8x12.hpp | 115 +
 .../kernels/a64_interleaved_fp32_mla_8x12/a55.cpp | 360 ++
 .../a64_interleaved_fp32_mla_8x12/generic.cpp | 320 ++
 .../kernels/a64_interleaved_fp32_mla_8x12/x1.cpp | 320 ++
 .../kernels/a64_interleaved_s8s32_dot_8x12.hpp | 110 +
 .../kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp | 273 ++
 .../a64_interleaved_s8s32_dot_8x12/generic.cpp | 253 ++
 .../kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp | 253 ++
 .../kernels/a64_interleaved_s8s32_mmla_8x12.hpp | 75 +-
 .../a64_interleaved_s8s32_mmla_8x12/generic.cpp | 685 ++--
 .../kernels/a64_interleaved_u8u32_dot_8x12.hpp | 110 +
 .../kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp | 273 ++
 .../a64_interleaved_u8u32_dot_8x12/generic.cpp | 253 ++
 .../kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp | 253 ++
 .../kernels/a64_interleaved_u8u32_mmla_8x12.hpp | 75 +-
 .../a64_interleaved_u8u32_mmla_8x12/generic.cpp | 644 ++--
 .../kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp | 33 +-
 .../a64_smallK_hybrid_fp32_mla_6x4/generic.cpp | 1 -
 .../a64_smallK_hybrid_fp32_mla_8x4/generic.cpp | 1 -
 .../arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp | 82 -
 .../kernels/sve_gemv_fp32_mla_8VL/generic.cpp | 1372 -------
 .../kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp | 26 +-
 .../sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp | 149 +-
 .../kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp | 104 +
 .../sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp | 2045 +++++++++++
 .../arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp | 36 +-
 .../kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp | 1366 +++++++
 .../kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp | 147 +-
 .../arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp | 32 +-
 .../kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp | 1366 +++++++
 .../kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp | 147 +-
 .../arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp | 18 +-
 .../kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp | 1143 ++++++
 .../kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp | 258 +-
 .../kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp | 104 +
 .../sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp | 1306 +++++++
 .../kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp | 104 +
 .../sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp | 1793 ++++++++++
 .../arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp | 24 +-
 .../kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp | 146 +-
 .../kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp | 101 +
 .../kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp | 1418 ++++++++
 .../arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp | 24 +-
 .../kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp | 343 +-
 .../kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp | 101 +
 .../kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp | 2431 +++++++++++++
 .../kernels/sve_hybrid_s8s32_dot_6x4VL.hpp | 38 +-
 .../kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp | 1033 ++++++
 .../kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp | 147 +-
 .../kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp | 115 +
 .../sve_hybrid_s8s32_mmla_6x4VL/generic.cpp | 1675 +++++++++
 .../arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp | 24 +-
 .../kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp | 144 +-
 .../kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp | 101 +
 .../kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp | 1418 ++++++++
 .../kernels/sve_hybrid_u8u32_dot_6x4VL.hpp | 40 +-
 .../kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp | 1033 ++++++
 .../kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp | 149 +-
 .../kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp | 115 +
 .../sve_hybrid_u8u32_mmla_6x4VL/generic.cpp | 1675 +++++++++
 .../kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp | 65 +-
 .../sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp | 517 ++-
 .../sve_interleaved_bf16fp32_mmla_8x3VL.hpp | 77 +-
 .../generic.cpp | 632 ++--
 .../kernels/sve_interleaved_fp16_mla_8x3VL.hpp | 74 +-
 .../sve_interleaved_fp16_mla_8x3VL/a64fx.cpp | 269 ++
 .../sve_interleaved_fp16_mla_8x3VL/generic.cpp | 502 ++-
 .../kernels/sve_interleaved_fp32_mla_8x3VL.hpp | 70 +-
 .../sve_interleaved_fp32_mla_8x3VL/a64fx.cpp | 269 ++
 .../sve_interleaved_fp32_mla_8x3VL/generic.cpp | 515 ++-
 .../sve_interleaved_fp32_mmla_8x3VL/generic.cpp | 2 +-
 .../kernels/sve_interleaved_s8s32_dot_8x3VL.hpp | 75 +-
 .../sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp | 270 ++
 .../sve_interleaved_s8s32_dot_8x3VL/generic.cpp | 515 ++-
 .../kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp | 75 +-
 .../sve_interleaved_s8s32_mmla_8x3VL/generic.cpp | 622 ++--
 .../kernels/sve_interleaved_u8u32_dot_8x3VL.hpp | 75 +-
 .../sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp | 270 ++
 .../sve_interleaved_u8u32_dot_8x3VL/generic.cpp | 515 ++-
 .../kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp | 75 +-
 .../sve_interleaved_u8u32_mmla_8x3VL/generic.cpp | 622 ++--
 .../sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp | 2 +-
 .../sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp | 232 +-
 .../sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp | 234 +-
 .../NEON/kernels/arm_gemm/mergeresults-fp16.cpp | 41 +
 .../NEON/kernels/arm_gemm/mergeresults-sve.cpp | 4 +-
 src/core/NEON/kernels/arm_gemm/mergeresults.cpp | 3 +-
 .../NEON/kernels/arm_gemm/merges/list-fp16.hpp | 24 +
 src/core/NEON/kernels/arm_gemm/merges/list.hpp | 1 -
 .../NEON/kernels/arm_gemm/quantize_wrapper.hpp | 15 +-
 src/core/NEON/kernels/arm_gemm/transform-sve.cpp | 35 +
 src/core/NEON/kernels/arm_gemm/transform.cpp | 136 +
 src/core/NEON/kernels/arm_gemm/transform.hpp | 90 +-
 .../a32_transpose_interleave_8way_32bit.hpp | 14 +-
 .../transforms/a64_transpose_interleave_128.hpp | 289 ++
 .../transforms/a64_transpose_interleave_12_1x4.hpp | 432 +++
 .../transforms/a64_transpose_interleave_12_1x8.hpp | 335 ++
 .../transforms/a64_transpose_interleave_12_2x2.hpp | 344 ++
 .../transforms/a64_transpose_interleave_12_2x4.hpp | 445 +++
 .../a64_transpose_interleave_12_2x4_fp32bf16.hpp | 735 ++++
 .../a64_transpose_interleave_12_s8s16.hpp | 275 ++
 .../a64_transpose_interleave_12_u8u16.hpp | 275 ++
 .../a64_transpose_interleave_12way_16bit.hpp | 145 -
 ...64_transpose_interleave_12way_half_to_float.hpp | 120 -
 .../transforms/a64_transpose_interleave_16.hpp | 137 +
 .../transforms/a64_transpose_interleave_16_1x4.hpp | 332 ++
 .../transforms/a64_transpose_interleave_16_1x8.hpp | 291 ++
 .../transforms/a64_transpose_interleave_16_2x2.hpp | 246 ++
 .../transforms/a64_transpose_interleave_16_2x4.hpp | 511 +++
 .../a64_transpose_interleave_16_2x4_fp32bf16.hpp | 447 +++
 .../transforms/a64_transpose_interleave_24.hpp | 272 ++
 .../a64_transpose_interleave_24_2x4_fp32bf16.hpp | 787 +++++
 .../a64_transpose_interleave_24_bf16fp32.hpp | 295 ++
 .../a64_transpose_interleave_24_fp16fp32.hpp | 295 ++
 .../a64_transpose_interleave_24way_16bit.hpp | 130 -
 .../transforms/a64_transpose_interleave_32_1x4.hpp | 508 +++
 .../transforms/a64_transpose_interleave_32_2x2.hpp | 452 +++
 .../transforms/a64_transpose_interleave_48.hpp | 245 ++
 .../transforms/a64_transpose_interleave_4_1x16.hpp | 319 ++
 .../transforms/a64_transpose_interleave_4_1x4.hpp | 338 ++
 .../transforms/a64_transpose_interleave_64.hpp | 255 ++
 .../a64_transpose_interleave_8way_32bit.hpp | 147 -
 .../transforms/a64_transpose_interleave_96.hpp | 269 ++
 .../NEON/kernels/arm_gemm/transforms/list-sve.hpp | 42 +
 src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 29 +-
 .../sve_transpose_interleave_12VL_2x4_fp32bf16.hpp | 376 ++
 .../transforms/sve_transpose_interleave_1VL.hpp | 163 +
 .../sve_transpose_interleave_1VL_1x4.hpp | 310 ++
 .../transforms/sve_transpose_interleave_3VL.hpp | 174 +
 .../sve_transpose_interleave_3VL_1x4.hpp | 368 ++
 .../sve_transpose_interleave_3VL_2x2.hpp | 318 ++
 .../transforms/sve_transpose_interleave_4VL.hpp | 188 +
 .../sve_transpose_interleave_4VL_1x4.hpp | 322 ++
 .../sve_transpose_interleave_4VL_2x2.hpp | 348 ++
 .../sve_transpose_interleave_6VL_1x8.hpp | 295 ++
 .../sve_transpose_interleave_6VL_2x4.hpp | 411 +++
 .../sve_transpose_interleave_6VL_2x4_fp32bf16.hpp | 238 ++
 .../sve_transpose_interleave_6VL_4x2.hpp | 322 ++
 .../transforms/sve_transpose_interleave_8VL.hpp | 307 ++
 .../sve_transpose_interleave_8VL_1x4.hpp | 286 ++
 .../sve_transpose_interleave_8VL_1x8.hpp | 259 ++
 .../sve_transpose_interleave_8VL_2x2.hpp | 380 ++
 .../sve_transpose_interleave_8VL_2x4.hpp | 465 +++
 .../sve_transpose_interleave_8VL_2x4_fp32bf16.hpp | 282 ++
 .../transforms/transpose_interleave_common.hpp | 4 +-
 src/core/NEON/kernels/arm_gemm/utils.hpp | 25 +-
 src/core/cpu/kernels/assembly/arm_gemm.hpp | 10 +-
 src/core/cpu/kernels/assembly/gemm_common.hpp | 7 +
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 2 +
 .../CL/functions/CLGEMMConvolutionLayer.cpp | 2 +
 src/runtime/NEON/functions/NEConvolutionLayer.cpp | 2 +-
 src/runtime/cpu/operators/CpuGemm.cpp | 1 +
 src/runtime/cpu/operators/CpuGemmConvolution.cpp | 9 +-
 src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 1 +
 .../operators/CpuGemmLowpMatrixMultiplyCore.cpp | 1 +
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 10 +-
 .../operators/internal/CpuGemmAssemblyDispatch.h | 1 +
 tests/validation/fixtures/GEMMFixture.h | 2 +-
 241 files changed, 77593 insertions(+), 10496 deletions(-)
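Usage sketch (editor's note, not part of the applied diff; text between the "---"
separator and the first "diff --git" header is ignored by git am, and the helper
function below is hypothetical). The Types.h hunk in this patch adds a fast_math
flag to GEMMInfo, positioned after fp_mixed_precision in the constructor; a caller
would opt in to the new BF16 fast-mode kernels like this:

    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    // Build a GEMMInfo requesting the BF16 "fast math" path.
    // Positional arguments follow the updated constructor:
    // (is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run,
    //  depth_output_gemm3d, reinterpret_input_as_3d, retain_internal_weights,
    //  gemmlowp_output_stage, fp_mixed_precision, fast_math, ...)
    GEMMInfo make_fast_math_gemm_info()
    {
        GEMMInfo info(false, false, true, 0, false, false,
                      GEMMLowpOutputStageInfo(), false, /* fast_math */ true);
        return info; // info.fast_math() now reports true
    }

The CpuGemmAssemblyDispatch changes listed in the diffstat carry this flag into
arm_gemm, where the gemm_fp32.cpp selectors below gate the BF16 kernels on
args._fast_mode && args._ci->has_bf16().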
 create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transform-sve.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transform.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
 delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp

diff --git a/Android.bp b/Android.bp
index 1d9ec1c9c1..e5bb7a6a80 100644
--- a/Android.bp
+++ b/Android.bp
@@ -198,12 +198,15 @@ cc_library_static {
         "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
         "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
+        "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
         "src/core/NEON/kernels/arm_gemm/misc.cpp",
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+        "src/core/NEON/kernels/arm_gemm/transform-sve.cpp",
+        "src/core/NEON/kernels/arm_gemm/transform.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
@@ -838,30 +841,52 @@ cc_library_static {
         "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
-        "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+        "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
         "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", @@ -880,23 +905,39 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp", diff --git a/SConstruct b/SConstruct index 8fa78b4c6f..8dbb68952c 100644 --- a/SConstruct +++ b/SConstruct @@ -309,7 +309,8 @@ if env['fat_binary']: if env['arch'] != 'armv8.2-a': print("Currently fat binary is only supported with armv8.2-a") Exit(1) - env.Append(CXXFLAGS = ['-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE']) + env.Append(CXXFLAGS = ['-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE', + '-DARM_COMPUTE_ENABLE_BF16', '-DARM_COMPUTE_ENABLE_I8MM', '-DARM_COMPUTE_ENABLE_SVEF32MM']) env.Append(CXXFLAGS = ['-DENABLE_NEON', '-DARM_COMPUTE_ENABLE_NEON']) if env['data_type_support']: diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index 4484271d63..76378d27ef 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -34,13 +34,15 @@ namespace arm_compute X(GENERIC) \ X(GENERIC_FP16) \ X(GENERIC_FP16_DOT) \ - X(A35) \ X(A53) \ X(A55r0) \ X(A55r1) \ + X(A35) \ X(A73) \ - X(KLEIN) \ - X(X1) + X(A510) \ + X(X1) \ + X(V1) \ + X(A64FX) /** CPU models types * diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index f6658e7544..9c00cbc88c 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ 
@@ -1948,6 +1948,7 @@ public:
       _reinterpret_input_as_3d(false),
       _retain_internal_weights(false),
       _gemmlowp_output_stage(),
+      _fast_math(false),
       _fp_mixed_precision(false),
       _broadcast_bias(false),
       _pretranpose_B(true),
@@ -1967,12 +1968,13 @@ public:
      * @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
      * @param[in] gemmlowp_output_stage   (Optional) GEMMLowp Output stage info
      * @param[in] fp_mixed_precision      (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] fast_math               (Optional) Use a data type of shorter width to improve performance
      * @param[in] broadcast_bias          (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
      * @param[in] activation_info        (Optional) Activation to apply after the matrix multiplication
      * @param[in] constant_weights        (Optional) Weights have constant values throughout multiple executions
      */
     GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
-             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
+             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
              const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool constant_weights = true) noexcept
     : _is_a_reshaped(is_a_reshaped),
       _is_b_reshaped(is_b_reshaped),
@@ -1981,6 +1983,7 @@ public:
       _reinterpret_input_as_3d(reinterpret_input_as_3d),
       _retain_internal_weights(retain_internal_weights),
       _gemmlowp_output_stage(gemmlowp_output_stage),
+      _fast_math(fast_math),
       _fp_mixed_precision(fp_mixed_precision),
       _broadcast_bias(broadcast_bias),
       _pretranpose_B(reshape_b_only_on_first_run),
@@ -2062,6 +2065,14 @@ public:
     {
         return _fp_mixed_precision;
     };
+    /** Flag which specifies if a shorter accumulator is to be used.
+     *
+     * @return True if a shorter accumulator has to be used
+     */
+    bool fast_math() const
+    {
+        return _fast_math;
+    };
     /** Flag which specifies whether to broadcast the shape of the bias tensor.
      *
      * @return True if the shape of the bias tensor is to be broadcasted.
@@ -2119,6 +2130,7 @@ private:
     bool                    _reinterpret_input_as_3d;
     bool                    _retain_internal_weights;
     GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+    bool                    _fast_math;
     bool                    _fp_mixed_precision;
     bool                    _broadcast_bias;
     bool                    _pretranpose_B;
diff --git a/filelist.json b/filelist.json
index 68e6aebf4f..e256744aab 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1210,12 +1210,14 @@
           "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
           "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
           "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+          "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
           "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
           "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
           "src/core/NEON/kernels/arm_gemm/misc.cpp",
           "src/core/NEON/kernels/arm_gemm/quantized.cpp",
           "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
-          "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp"
+          "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+          "src/core/NEON/kernels/arm_gemm/transform.cpp"
         ],
         "neon": {
           "estate32": [
@@ -1234,30 +1236,52 @@
           "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
-          "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", @@ -1280,29 +1304,46 @@ }, "sve": { "all": [ - "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp" + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp", + "src/core/NEON/kernels/arm_gemm/transform-sve.cpp" ] } } diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp index 9f4d5d1433..2328f62515 100644 --- a/src/common/cpuinfo/CpuModel.cpp +++ b/src/common/cpuinfo/CpuModel.cpp @@ -50,8 +50,10 @@ bool model_supports_fp16(CpuModel model) case CpuModel::GENERIC_FP16: case CpuModel::GENERIC_FP16_DOT: case CpuModel::A55r1: + case CpuModel::A510: case CpuModel::X1: - case CpuModel::KLEIN: + case CpuModel::V1: + case CpuModel::A64FX: return true; default: return false; @@ -64,8 +66,10 @@ bool model_supports_dot(CpuModel model) { case CpuModel::GENERIC_FP16_DOT: case CpuModel::A55r1: + case CpuModel::A510: case CpuModel::X1: - case CpuModel::KLEIN: + case CpuModel::V1: + case CpuModel::A64FX: return true; default: return false; @@ -76,7 +80,9 @@ bool model_supports_sve(CpuModel model) { switch(model) { - case CpuModel::KLEIN: + case CpuModel::A510: + case CpuModel::V1: + case CpuModel::A64FX: return true; default: return false; @@ -92,9 +98,9 @@ CpuModel midr_to_model(uint32_t midr) const int variant = (midr >> 20) & 0xF; const int cpunum = (midr >> 4) & 0xFFF; + // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC" if(implementer == 0x41) // Arm CPUs { - // Only CPUs we have code paths for are detected. 
-        // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
         switch(cpunum)
         {
             case 0xd03: // A53
@@ -134,11 +140,26 @@ CpuModel midr_to_model(uint32_t midr)
             case 0xd4a: // E1
                 model = CpuModel::GENERIC_FP16_DOT;
                 break;
+            case 0xd40: // V1
+                model = CpuModel::V1;
+                break;
             case 0xd44: // X1
                 model = CpuModel::X1;
                 break;
             case 0xd46:
-                model = CpuModel::KLEIN;
+                model = CpuModel::A510;
+                break;
+            default:
+                model = CpuModel::GENERIC;
+                break;
+        }
+    }
+    else if(implementer == 0x46)
+    {
+        switch(cpunum)
+        {
+            case 0x001: // A64FX
+                model = CpuModel::A64FX;
                 break;
             default:
                 model = CpuModel::GENERIC;
@@ -147,7 +168,6 @@ CpuModel midr_to_model(uint32_t midr)
     }
     else if(implementer == 0x48)
     {
-        // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
         switch(cpunum)
         {
             case 0xd40: // A76
@@ -160,7 +180,6 @@ CpuModel midr_to_model(uint32_t midr)
     }
     else if(implementer == 0x51)
     {
-        // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
         switch(cpunum)
         {
             case 0x800: // A73
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index 7766656adb..4f2c47bf11 100644
--- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -37,9 +37,6 @@
 #define ASM_PREFETCHW(address)     "PRFM PSTL1KEEP, " address "\n"
 #define ASM_PREFETCHWL2(address)   "PRFM PSTL2KEEP, " address "\n"
 
-// Lee's uarchsim hack
-//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n"
-
 // No preload at all
 //#define ASM_PREFETCH(address) ""
 #else
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 8244523696..af80c3637c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -31,73 +31,92 @@
 #include "gemv_batched.hpp"
 #include "gemv_pretransposed.hpp"
 
+#include "kernels/a32_sgemm_8x6.hpp"
+
 #include "kernels/a64_hybrid_bf16fp32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp"
 #include "kernels/a64_interleaved_bf16fp32_dot_8x12.hpp"
 #include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
 #include "kernels/a64_sgemm_8x12.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+
+#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp"
 #include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
 #include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
-#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
 {
+#ifdef __aarch64__
 #ifdef ARM_COMPUTE_ENABLE_BF16
 #ifdef ARM_COMPUTE_ENABLE_SVE
-{ // gemm_bf16_interleaved
+// gemm_bf16_interleaved
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_bf16fp32_mmla_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>4); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
-},
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_bf16fp32_mmla_6x4VL",
+    [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "sve_hybrid_bf16fp32_dot_6x4VL",
     [](const GemmArgs &args) { return args._ci->has_svebf16(); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_bf16fp32_dot_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>2); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
-},
-# endif // SVE
-{ // gemm_bf16_interleaved
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<bfloat16, float>::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_bf16fp32_mmla_6x16",
+    [](const GemmArgs &args) { return args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_interleaved_bf16fp32_mmla_8x12",
     [](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>4); },
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
-},
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "a64_hybrid_bf16fp32_dot_6x16",
     [](const GemmArgs &args) { return args._ci->has_bf16(); },
-    nullptr,
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_interleaved_bf16fp32_dot_8x12",
     [](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>2); },
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
-},
-#endif // ARM_COMPUTE_ENABLE_BF16
-#ifdef __aarch64__
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_sgemm_8x12",
     nullptr,
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>::estimate_cycles<float>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
-},
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
 #elif defined(__arm__)
 {
     GemmMethod::GEMM_INTERLEAVED,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b41d8dd097..01976132ed 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -23,7 +23,7 @@
  */
 
 // This can only be built if the target/compiler supports FP16 arguments.
-#ifdef __ARM_FP16_ARGS
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
 
 #include "arm_gemm.hpp"
 
@@ -43,48 +43,37 @@ namespace arm_gemm {
 
 static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-{
+#ifdef ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<__fp16, __fp16>::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "sve_hybrid_fp16_mla_6x4VL",
     [](const GemmArgs &args) { return args._ci->has_sve(); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
-    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
-},
-{
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
+),
+GemmImplementation<__fp16, __fp16>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_fp16_mla_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
-},
-#endif
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+#if defined(__aarch64__)
 GemmImplementation<__fp16, __fp16>::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "a64_hybrid_fp16_mla_6x32",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     [](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
-    nullptr,
-#endif
-    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles(args, cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
 ),
 GemmImplementation<__fp16, __fp16>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_hgemm_8x24",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     [](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
-    nullptr,
-#endif
-    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles(args, cls_a64_hgemm_8x24::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
     [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
 ),
-#endif // aarch64 && FP16
-#ifdef __aarch64__
 {
     GemmMethod::GEMM_INTERLEAVED,
     "a64_sgemm_8x12",
@@ -124,4 +113,4 @@ template std::vector get_compatible_kernels<__fp16, __fp16, N
 
 } // namespace arm_gemm
 
-#endif // __ARM_FP16_ARGS
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 1632e301ac..3cf84a614a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -31,17 +31,22 @@
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_gemv_fp32_mla_32.hpp"
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp"
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp"
+#include "kernels/a64_hybrid_fp32_mla_4x24.hpp"
 #include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
 #include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
+#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
 #include "kernels/a64_sgemm_8x12.hpp"
 #include "kernels/a64_sgemm_8x6.hpp"
 #include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
 #include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
 
-#include "kernels/sve_gemv_fp32_mla_8VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
 #include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
 #include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
+#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
 #include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
 #include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
 #include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp"
@@ -59,57 +64,94 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
     [](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
 },
 #ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_BF16
+// "fast mode" (BF16) kernels
+GemmImplementation<float, float>::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_interleaved_bf16fp32_mmla_8x12",
+    [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_fp32bf16fp32_mmla_6x16",
+    [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_fp32bf16fp32_mmla_4x24",
+    [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
 #ifdef ARM_COMPUTE_ENABLE_SVE
-{
+#ifdef ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_bf16fp32_mmla_8x3VL",
+    [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+    [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
-    "sve_gemv_fp32_mla_8VL",
-    [](const GemmArgs &args) { return args._ci->has_sve() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
-    [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); }
-},
-#endif
-{
+    "sve_hybrid_fp32bf16fp32_mmla_6x4VL",
+    [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>::estimate_cycles<float>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
-    "a64_gemv_fp32_mla_32",
-    [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
-    nullptr,
-    [](const GemmArgs &args) { return new GemvPretransposed<cls_a64_gemv_fp32_mla_32, float, float>(args); }
-},
-
-// MMLA next due to higher throughput (SVE only)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
+    "sve_hybrid_fp32bf16fp32_mmla_4x6VL",
+    [](const GemmArgs &args) {
return args._fast_mode && args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_SVEF32MM
+// MMLA next due to its higher throughput (the FP32 MMLA kernel is SVE only).
+// Prefer this in all cases, except when fast mode is requested and BF16 is available.
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_fp32_mmla_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_svef32mm() && (args._Ksize>4); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_bf16()); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
 },
-#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM
-
-#ifdef ARM_COMPUTE_ENABLE_SVE
-// SVE smallk / hybrid methods
+#endif // ARM_COMPUTE_ENABLE_SVEF32MM
+// SVE kernels
 {
     GemmMethod::GEMM_HYBRID,
     "sve_smallK_hybrid_fp32_mla_8x1VL",
     [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize <= 24 && !args._indirect_input; },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    nullptr,
     [](const GemmArgs &args) { return new GemmHybrid(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "sve_hybrid_fp32_mla_8x1VL",
     [](const GemmArgs &args) { return args._ci->has_sve(); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (args._Nsize < 12); },
+    [](const GemmArgs &args) { return (args._Nsize < 12); },
     [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
 },
-{
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "sve_hybrid_fp32_mla_6x4VL",
     [](const GemmArgs &args) { return args._ci->has_sve(); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
-},
+),
+GemmImplementation::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_fp32_mla_8x3VL",
+    [](const GemmArgs &args) { return args._ci->has_sve(); },
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
+    [](const GemmArgs &args) { return new GemmInterleaved(args); }
+),
 #endif // ARM_COMPUTE_ENABLE_SVE

 // Cortex-A35 specific kernel - use for any problem on A35, and never in any other cases.
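A note on what fast mode changes numerically: with _fast_mode set and BF16 present, the FP32 MMLA kernel above stands down and the fp32bf16fp32 kernels take over, squashing the FP32 operands to BF16 for the multiplies while keeping accumulation in FP32. A scalar sketch of those semantics (assuming round-to-nearest-even, as the BFCVT instruction performs; NaN handling omitted):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of the fast-mode numerics; illustrative, not the MMLA kernel.
static inline float bf16_round(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    // Round-to-nearest-even into the top 16 bits, then drop the low mantissa bits.
    bits += 0x7FFFu + ((bits >> 16) & 1u);
    bits &= 0xFFFF0000u;
    float r;
    std::memcpy(&r, &bits, sizeof(r));
    return r;
}

// Products are formed from BF16-rounded operands; accumulation stays FP32.
float dot_fast_mode(const float *a, const float *b, size_t k) {
    float acc = 0.0f;
    for (size_t i = 0; i < k; i++) {
        acc += bf16_round(a[i]) * bf16_round(b[i]);
    }
    return acc;
}

The Cortex-A35 entry below is unaffected by any of this and keeps its fixed model check.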
{ @@ -141,27 +183,25 @@ static const GemmImplementation gemm_fp32_methods[] = [](const GemmArgs &args) { return (args._Nsize < 12); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_fp32_mla_4x24", + nullptr, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +), GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_fp32_mla_6x16", nullptr, - [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } ), -#ifdef ARM_COMPUTE_ENABLE_SVE -{ - GemmMethod::GEMM_INTERLEAVED, - "sve_interleaved_fp32_mla_8x3VL", - [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, - [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, - [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, -#endif // ARM_COMPUTE_ENABLE_SVE GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_sgemm_8x12", nullptr, - [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } ), #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index d702cffce1..436316c0f7 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -74,7 +74,7 @@ class GemmHybrid : public GemmCommon { } if (args._cfg && args._cfg->inner_block_size) { - return args._cfg->inner_block_size; + return roundup(args._cfg->inner_block_size, strategy::k_unroll()); } // Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this. @@ -97,7 +97,13 @@ class GemmHybrid : public GemmCommon { // single block. static unsigned int compute_n_block(const GemmArgs &args) { if (args._cfg && args._cfg->outer_block_size) { - return args._cfg->outer_block_size; + unsigned int n_block = args._cfg->outer_block_size; + + // Needs to be (at least a single) multiple of the kernel output width. + n_block /= strategy::out_width(); + n_block = std::max(n_block, 1u) * strategy::out_width(); + + return n_block; } if (args._Nsize <= 64) { @@ -264,6 +270,17 @@ public: return total_cycles; } + + GemmConfig get_config() override { + GemmConfig c; + + c.method = GemmMethod::GEMM_HYBRID; + c.inner_block_size = _k_block; + c.outer_block_size = _n_block; + c.filter = get_type_name(); + + return c; + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp index 41fecc6bec..5cbdf20798 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -55,31 +55,31 @@ namespace { template class run_hybrid_kernel { public: - template - static void run ( + template + static inline void run ( #ifdef CYCLE_PROFILING profiler &prof, #endif - const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, const OutputStage &os, const int32_t *col_bias, unsigned int n_0 ); }; template<> -template -void run_hybrid_kernel::run( +template +inline void run_hybrid_kernel::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, const Nothing &, const int32_t *, unsigned int) { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); #endif UNUSED(kern_k); - /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing + /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing * a partial block and pad the bias for that block. 
*/ if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) { /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */ @@ -112,13 +112,13 @@ void run_hybrid_kernel::run( } template<> -template -void run_hybrid_kernel::run( +template +inline void run_hybrid_kernel::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); @@ -129,13 +129,13 @@ void run_hybrid_kernel::run( } template<> -template -void run_hybrid_kernel::run( +template +inline void run_hybrid_kernel::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { UNUSED(kern_k); // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop. @@ -183,7 +183,8 @@ void run_hybrid_kernel::run( // Implementation of the GemmCommon abstract class. template class GemmHybridIndirect : public GemmCommon { - typedef typename strategy::operand_type Toi; + typedef typename strategy::lhs_operand_type Tloi; + typedef typename strategy::rhs_operand_type Troi; typedef typename strategy::result_type Tri; GemmArgs _args; @@ -201,7 +202,7 @@ class GemmHybridIndirect : public GemmCommon { const unsigned int _Mround; /* Pretransposed buffer. */ - const Toi *_B_transposed=nullptr; + const Troi *_B_transposed=nullptr; /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */ const To * const * const * _indirect_buf = nullptr; @@ -233,7 +234,7 @@ class GemmHybridIndirect : public GemmCommon { } if (args._cfg && args._cfg->inner_block_size) { - return args._cfg->inner_block_size; + return roundup(args._cfg->inner_block_size, strategy::k_unroll()); } // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other @@ -356,8 +357,8 @@ public: // In convolution mode, we need input pointers. if (_convolver) { - in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr); - in_row_strings.resize(_args._Ksections, nullptr); + in_row_ptrs = std::vector(strategy::out_height() * _args._Ksections, nullptr); + in_row_strings = std::vector(_args._Ksections, nullptr); for (unsigned int i=0; i<_args._Ksections; i++) { in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]); @@ -371,7 +372,7 @@ public: /* Make sure we've been set up correctly. 
*/ assert(_B_transposed); - static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); // static_assert(std::is_same::value, "gemm_native: Result types must be the same."); /* For now, each work item implies all the K for a given output @@ -422,7 +423,7 @@ public: const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize); const unsigned int multi = p.dim(3); - const Toi *b_panel = _B_transposed + + const Troi *b_panel = _B_transposed + (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) + (k0 * roundup(_args._Nsize, strategy::out_width())) + (n0 * kern_k); @@ -510,7 +511,7 @@ public: size_t get_B_pretransposed_array_size() const override { // Start with actual pretransposed buffer... - size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi); + size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi); // Space for result row pointers (not strictly needed any more but retained for indirect output testing) size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *); @@ -536,7 +537,7 @@ public: // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 uintptr_t buffer_int = reinterpret_cast(in_buffer); - Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + Troi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); _B_transposed = buffer; strategy strat(_args._ci); @@ -548,47 +549,55 @@ public: /* Figure out the size of each block. */ unsigned int k_size = kmax - k0; - // We need to insert padding at the end of each K section. - // The computation needed is a little delicate - the coordinates from the block walker are expressed in - // terms of the full, padded, _Ktotal. - // But we need to transform each section with reference to the original, unpadded, input, letting the - // transform pad each section as needed. + if (_args._Ksections > 1) { + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. - // This is needed for computations below. - const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll()); + // This is needed for computations below. + const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll()); - // The expected output format is also an entire columns interleaved, then the next set of - // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at - // a time. - for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){ - unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize); + // The expected output format is also an entire columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time. + for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){ + unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize); - // Track where we are and how much work is left. 
- unsigned int kpos = k0; - unsigned int kleft = k_size; + // Track where we are and how much work is left. + unsigned int kpos = k0; + unsigned int kleft = k_size; - while (kleft) { - // Which section are we in? Based on the rounded-up section size. - unsigned int k_section_base = kpos / rounded_section_size; - // How far into the section are we? - unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); - // We will either copy the rest of this section, or to the end of the requested length. - unsigned int k_length = std::min(_args._Ksize - k_offset, kleft); + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_args._Ksize - k_offset, kleft); - strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, - x0, xmax, - (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. - (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. + strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. - // We need to modify our position based on the ROUNDED version of what we just did. - unsigned int padded_length = roundup(k_length, strategy::k_unroll()); + // We need to modify our position based on the ROUNDED version of what we just did. + unsigned int padded_length = roundup(k_length, strategy::k_unroll()); - buffer += strategy::out_width() * padded_length; + buffer += strategy::out_width() * padded_length; - kpos += padded_length; - kleft -= padded_length; + kpos += padded_length; + kleft -= padded_length; + } } + } else { + // In the single K section case, can process the whole lot in one go. + // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize. + strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, + 0, _args._Nsize, k0, std::min(kmax, _args._Ksize)); + buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll()); } } } @@ -597,12 +606,17 @@ public: void set_pretransposed_B_data(void *in_buffer) override { // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 uintptr_t buffer_int = reinterpret_cast(in_buffer); - _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); _col_bias = reinterpret_cast(in_buffer); } - // Estimate cycles for given problem given provided parameters - static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters ¶ms, const OutputStage &os = {} ) { + // Estimate cycles for given problem given provided parameters. + // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance + // parameters - it's arbitrary but usually either the input or output type. 
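A sketch of the strategy-side contract this templated estimate relies on, before the declaration itself (hypothetical strategy class and illustrative numbers only, not a real kernel's figures):

#include <type_traits>

class CPUInfo;  // forward-declaration stand-in

struct PerformanceParameters {
    float kernel_macs_cycle;    // MACs retired per cycle
    float prepare_bytes_cycle;  // prepare/transform bandwidth
    float merge_bytes_cycle;    // merge/store bandwidth
};

// Hypothetical strategy: tuning numbers are keyed by the perf_type tag, so one
// kernel class can report different throughput per data type on the same core.
struct cls_example_strategy {
    template<typename perf_type>
    static PerformanceParameters get_performance_parameters(const CPUInfo *) {
        // Illustrative values only.
        return std::is_same<perf_type, float>::value
               ? PerformanceParameters{12.6f, 3.2f, 2.9f}
               : PerformanceParameters{25.2f, 3.2f, 2.9f};
    }
};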
+ template + static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) { + const PerformanceParameters params = strategy::template get_performance_parameters(args._ci); + // Note: Current hybrid kernels don't actually round up height (they // have paths for each possible height). Might need to make this // configurable in future. @@ -666,6 +680,17 @@ public: assert(parms.input_channels == _args._Ksize); _convolver = std::unique_ptr>(new convolver(parms)); } + + GemmConfig get_config() override { + GemmConfig c; + + c.method = GemmMethod::GEMM_HYBRID; + c.inner_block_size = _k_block; + c.outer_block_size = _n_block; + c.filter = get_type_name(); + + return c; + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index e48d9b9a07..c72dca2e96 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -81,11 +81,42 @@ class GemmHybridQuantized : public GemmCommon { static unsigned int compute_k_block(const GemmArgs &args) { // We don't support K blocks as we only temporarily store 32 bit results. return args._Ksize; + + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } + + const unsigned int L1_size = args._ci->get_L1_cache_size(); + + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + k_block /= strategy::k_unroll(); + k_block = std::max(k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int numk_blocks = iceildiv(args._Ksize, k_block); + + // So divide the space equally into that many blocks. + k_block = iceildiv(args._Ksize, numk_blocks); + + // And round UP to the K unroll level required. + k_block = roundup(k_block, strategy::k_unroll()); + + return k_block; } static unsigned int compute_n_block(const GemmArgs &args) { if (args._cfg && args._cfg->outer_block_size) { - return args._cfg->outer_block_size; + unsigned int n_block = args._cfg->outer_block_size; + + // Needs to be (at least a single) multiple of the kernel output width. 
+ n_block /= strategy::out_width(); + n_block = std::max(n_block, 1u) * strategy::out_width(); + + return n_block; } const unsigned int k_block = compute_k_block(args); @@ -279,6 +310,17 @@ public: _qp.bias = bias; _qp.bias_multi_stride = bias_multi_stride; } + + GemmConfig get_config() override { + GemmConfig c; + + c.method = GemmMethod::GEMM_HYBRID; + c.inner_block_size = _k_block; + c.outer_block_size = _n_block; + c.filter = get_type_name(); + + return c; + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index bfb3ca901f..cfbf66d60f 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -34,11 +34,13 @@ #include "kernels/a64_gemm_s8_8x12.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" #include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp" #include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" #include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp" #include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" #include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" #include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" @@ -47,46 +49,56 @@ namespace arm_gemm { static const GemmImplementation gemm_s8_methods[] = { #ifdef ARM_COMPUTE_ENABLE_SVE -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8s32_mmla_6x4VL", + [](const GemmArgs &args) { return args._ci->has_svei8mm(); }, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); }, - [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, -#endif // ARM_COMPUTE_ENABLE_I8MM +), { GemmMethod::GEMM_HYBRID, "sve_smallK_hybrid_s8s32_dot_8x1VL", - [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, - [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args) { return args._ci->has_svei8mm() && args._Ksize<=64 && !args._indirect_input; }, + nullptr, [](const GemmArgs &args) { return new GemmHybrid(args); } }, -{ +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; }, - [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, - [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs 
&args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, -#endif // SVE -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +), +#endif // ARM_COMPUTE_ENABLE_SVE +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_interleaved_s8s32_mmla_8x12", - [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); }, + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, -#endif // ARM_COMPUTE_ENABLE_I8MM +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8s32_mmla_6x16", + [](const GemmArgs &args) { return args._ci->has_i8mm(); }, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +), { GemmMethod::GEMM_HYBRID, "a64_smallK_hybrid_s8s32_dot_8x4", @@ -108,27 +120,29 @@ static const GemmImplementation gemm_s8_methods[] = { [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); }, [](const GemmArgs &args) { return new GemmInterleaved(args); }, }, -{ +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, "a64_hybrid_s8s32_dot_6x16", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_gemm_s8_8x12", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - nullptr, + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_gemm_s8_4x4", nullptr, - nullptr, + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, +), + { GemmMethod::DEFAULT, "", diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 7f870b83d7..5639cb4182 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -192,7 +192,7 @@ void kernel_and_merge::run( { #ifdef CYCLE_PROFILING - auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); + auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr))); #endif // The interleaved kernel outputs in blocks - each block is a // row-major matrix of size out_width * out_height. The merge @@ -496,7 +496,7 @@ class GemmInterleaved : public GemmCommon { static unsigned int get_k_block_size(const GemmArgs &args) { if (args._cfg && args._cfg->inner_block_size) { - return args._cfg->inner_block_size; + return roundup(args._cfg->inner_block_size, strategy::k_unroll()); } // K blocking not supported if we are requantizing. @@ -947,47 +947,55 @@ public: /* Figure out the size of each block. 
*/ unsigned int k_size = (current.kmax() - current.k0()); - // We need to insert padding at the end of each K section. - // The computation needed is a little delicate - the coordinates from the block walker are expressed in - // terms of the full, padded, _Ktotal. - // But we need to transform each section with reference to the original, unpadded, input, letting the - // transform pad each section as needed. + if (_Ksections > 1) { + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. - // This is needed for computations below. - const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll()); + // This is needed for computations below. + const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll()); - // The expected output format is also an entire columns interleaved, then the next set of - // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at - // a time. - for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){ - unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax()); + // The expected output format is also an entire columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time. + for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) { + unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax()); - // Track where we are and how much work is left. - unsigned int kpos = current.k0(); - unsigned int kleft = k_size; + // Track where we are and how much work is left. + unsigned int kpos = current.k0(); + unsigned int kleft = k_size; - while (kleft) { - // Which section are we in? Based on the rounded-up section size. - unsigned int k_section_base = kpos / rounded_section_size; - // How far into the section are we? - unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); - // We will either copy the rest of this section, or to the end of the requested length. - unsigned int k_length = std::min(_Ksize - k_offset, kleft); + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_Ksize - k_offset, kleft); - strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, - x0, xmax, - (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. - (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. 
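Restated outside the loop for clarity — the section arithmetic just performed, as an illustrative helper with one worked value (same definitions as above; not part of the patch):

#include <algorithm>

struct KStep {
    unsigned int section;  // which K section this step reads from
    unsigned int offset;   // offset into the *unpadded* section
    unsigned int length;   // number of real rows to transform this step
};

KStep next_k_step(unsigned int kpos, unsigned int kleft,
                  unsigned int Ksize, unsigned int rounded_section_size) {
    KStep s;
    s.section = kpos / rounded_section_size;
    s.offset  = kpos - (s.section * rounded_section_size);
    s.length  = std::min(Ksize - s.offset, kleft);
    return s;
}
// e.g. Ksize = 18, k_unroll = 4 => rounded_section_size = 20: at kpos = 20 the
// walk lands on section 1 at offset 0, and the transform pads rows 18..19.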
- // We need to modify our position based on the ROUNDED version of what we just did. - unsigned int padded_length = roundup(k_length, strategy::k_unroll()); + // We need to modify our position based on the ROUNDED version of what we just did. + unsigned int padded_length = roundup(k_length, strategy::k_unroll()); - buffer += strategy::out_width() * padded_length; + buffer += strategy::out_width() * padded_length; - kpos += padded_length; - kleft -= padded_length; + kpos += padded_length; + kleft -= padded_length; + } } + } else { + // In the single K section case, can process the whole lot in one go. + // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize. + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize)); + buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll()); } } while (current.advance()); } @@ -1019,12 +1027,15 @@ public: } // Estimate cycles for given problem given provided parameters - static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters ¶ms) { + template + static uint64_t estimate_cycles(const GemmArgs &args) { unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args)); + const PerformanceParameters ¶ms = strategy::template get_performance_parameters(args._ci); + uint64_t total_macs = static_cast(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args); uint64_t prepare_bytes = static_cast(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi); - uint64_t merge_bytes = static_cast(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr); + uint64_t merge_bytes = static_cast(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr); float mac_cycles = static_cast(total_macs) / params.kernel_macs_cycle; float prepare_cycles = static_cast(prepare_bytes) / params.prepare_bytes_cycle; @@ -1042,6 +1053,17 @@ public: return static_cast(total_cycles); } + + GemmConfig get_config() override { + GemmConfig c; + + c.method = GemmMethod::GEMM_INTERLEAVED; + c.inner_block_size = _k_block; + c.outer_block_size = _x_block; + c.filter = get_type_name(); + + return c; + } }; // Aliases for the variations diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index 985567f6f3..aa62815438 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -29,15 +29,21 @@ #include "kernels/a64_gemm_s8_4x4.hpp" #include "kernels/a64_gemm_s8_8x12.hpp" #include "kernels/a64_hybrid_s8qa_dot_4x16.hpp" +#include "kernels/a64_hybrid_s8qa_mmla_4x16.hpp" #include "kernels/a64_hybrid_s8qs_dot_6x16.hpp" +#include "kernels/a64_hybrid_s8qs_mmla_6x16.hpp" #include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp" #include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" -#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" #include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp" +#include "kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp" #include 
"kernels/sve_hybrid_s8qs_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp" +#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp" #include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" #include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" #include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" @@ -54,62 +60,98 @@ namespace arm_gemm { static const GemmImplementation gemm_qint8_methods[] = { #ifdef ARM_COMPUTE_ENABLE_SVE -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qa_mmla_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qs_mmla_6x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif // ARM_COMPUTE_ENABLE_I8MM +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "sve_hybrid_s8s32_mmla_6x4VL", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_s8s32_dot_8x1VL", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, -#ifdef ARM_COMPUTE_ENABLE_SVE2 -{ +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qs_dot_6x4VL", [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_symmetric(qp); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qa_dot_4x4VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); }, - [](const 
GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } -}, -#endif // ARM_COMPUTE_ENABLE_SVE2 -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif // SVE -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +), +#endif // ARM_COMPUTE_ENABLE_SVE +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qa_mmla_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qs_mmla_6x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_symmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_interleaved_s8s32_mmla_8x12", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif // ARM_COMPUTE_ENABLE_I8MM +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "a64_hybrid_s8s32_mmla_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), { GemmMethod::GEMM_HYBRID_QUANTIZED, "a64_smallK_hybrid_s8s32_dot_8x4", @@ -135,42 +177,42 @@ GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_s8qs_dot_6x16", [](const GemmArgs &args, const Requantize32 &qp) { return 
args._ci->has_dotprod() && quant_hybrid_symmetric(qp); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_s8qs_dot_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } ), GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_s8qa_dot_4x16", [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_s8qa_dot_4x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } ), GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_s8s32_dot_6x16", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } ), GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_gemm_s8_8x12", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args, cls_a64_gemm_s8_8x12::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } ), -{ +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_gemm_s8_4x4", nullptr, - [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, +), { GemmMethod::QUANTIZE_WRAPPER, "quantized_wrapper", [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; }, - [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); }, + [](const GemmArgs &, const Requantize32 &) { return false; }, [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); } }, { diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp index f3f2f335fd..abd2799583 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp @@ -29,13 +29,17 @@ #include "kernels/a64_gemm_u8_4x4.hpp" #include "kernels/a64_gemm_u8_8x12.hpp" #include "kernels/a64_hybrid_u8qa_dot_4x16.hpp" +#include "kernels/a64_hybrid_u8qa_mmla_4x16.hpp" #include "kernels/a64_hybrid_u8u32_dot_6x16.hpp" +#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp" #include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp" #include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp" 
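(An aside on the Requantize32 plumbing these u8 kernels share with the s8 table above: the row and column sums exist to apply the standard zero-point correction. In exact scalar form, with per-tensor offsets:

#include <cstdint>

// sum_k (a_k - z_a)(b_k - z_b)
//   = sum_k a_k b_k - z_b * sum_k a_k - z_a * sum_k b_k + K * z_a * z_b
int32_t corrected_acc(int32_t raw_acc, int32_t row_sum_a, int32_t col_sum_b,
                      int32_t z_a, int32_t z_b, int32_t K) {
    return raw_acc - z_b * row_sum_a - z_a * col_sum_b + K * z_a * z_b;
}

The kernels precompute the column-sum term at pretranspose time, which is why the quantized paths carry a col_bias pointer.)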
#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp" -#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp" #include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp" +#include "kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp" +#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp" #include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp" #include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp" #include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp" @@ -51,55 +55,77 @@ namespace arm_gemm { static const GemmImplementation gemm_quint8_methods[] = { #ifdef ARM_COMPUTE_ENABLE_SVE -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "sve_hybrid_u8qa_mmla_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_mmla_8x3VL", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "sve_hybrid_u8u32_mmla_6x4VL", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_u8u32_dot_8x1VL", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, -#ifdef ARM_COMPUTE_ENABLE_SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL) -{ +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_u8qa_dot_4x4VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } -}, -#endif // ARM_COMPUTE_ENABLE_SVE2 -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "sve_hybrid_u8u32_dot_6x4VL", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return 
args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } -}, -{ +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_dot_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif -#ifdef ARM_COMPUTE_ENABLE_I8MM -{ +), +#endif // ARM_COMPUTE_ENABLE_SVE +GemmImplementation::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8qa_mmla_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), +GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "a64_interleaved_u8u32_mmla_8x12", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); }, - [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, + [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } -}, -#endif +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "a64_hybrid_u8u32_mmla_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +), { GemmMethod::GEMM_HYBRID_QUANTIZED, "a64_smallK_hybrid_u8u32_dot_8x4", @@ -125,35 +151,35 @@ GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_u8qa_dot_4x16", [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_u8qa_dot_4x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } ), GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, "a64_hybrid_u8u32_dot_6x16", [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_u8u32_dot_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect::estimate_cycles(args); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } ), GemmImplementation::with_estimate( 
    GemmMethod::GEMM_INTERLEAVED,
    "a64_gemm_u8_8x12",
    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
-    [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args, cls_a64_gemm_u8_8x12::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); },
    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); }
),
-{
+GemmImplementation::with_estimate(
    GemmMethod::GEMM_INTERLEAVED,
    "a64_gemm_u8_4x4",
    nullptr,
-    [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+    [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized::estimate_cycles(args); },
    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); }
-},
+),
{
    GemmMethod::QUANTIZE_WRAPPER,
    "quantized_wrapper",
    [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
-    [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+    [](const GemmArgs &, const Requantize32 &) { return false; },
    [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); }
},
{
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 4c05fd1b73..75d6b362cc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -34,11 +34,13 @@
 #include "kernels/a64_gemm_u8_4x4.hpp"
 #include "kernels/a64_gemm_u8_8x12.hpp"
 #include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp"
 #include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
 #include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
 #include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
 
 #include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
 #include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
 #include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
@@ -47,46 +49,56 @@ namespace arm_gemm {
 
 static const GemmImplementation gemm_u8_methods[] = {
 #ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+GemmImplementation::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_u8u32_mmla_6x4VL",
+    [](const GemmArgs &args) { return args._ci->has_svei8mm(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
+),
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_u8u32_mmla_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
-},
-#endif
+),
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_u8u32_dot_8x1VL",
+    "sve_smallK_hybrid_u8u32_dot_8x1VL",
     [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    nullptr,
     [](const GemmArgs &args) { return new GemmHybrid(args); }
 },
-{
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "sve_hybrid_u8u32_dot_6x4VL",
     [](const GemmArgs &args) { return args._ci->has_sve(); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
-},
-{
+),
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_interleaved_u8u32_dot_8x3VL",
     [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
-},
-#endif
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_interleaved_u8u32_mmla_8x12",
     [](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
-},
-#endif
+),
+GemmImplementation::with_estimate(
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_u8u32_mmla_6x16",
+    [](const GemmArgs &args) { return args._ci->has_i8mm(); },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
+),
 {
     GemmMethod::GEMM_HYBRID,
     "a64_smallK_hybrid_u8u32_dot_8x4",
@@ -108,27 +120,27 @@ static const GemmImplementation gemm_u8_methods[] = {
     [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
     [](const GemmArgs &args) { return new GemmInterleaved(args); },
 },
-{
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "a64_hybrid_u8u32_dot_6x16",
     [](const GemmArgs &args) { return args._ci->has_dotprod(); },
-    [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmHybridIndirect(args); }
-},
-{
+),
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_gemm_u8_8x12",
     [](const GemmArgs &args) { return args._ci->has_dotprod(); },
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
-},
-{
+),
+GemmImplementation::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "a64_gemm_u8_4x4",
     nullptr,
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args); },
     [](const GemmArgs &args) { return new GemmInterleaved(args); }
-},
+),
 {
     GemmMethod::DEFAULT,
     "",
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 12216009d2..4fc9b3456a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,6 +95,18 @@ public:
     void set_pretransposed_B_data(void *buffer) override {
         _subgemm->set_pretransposed_B_data(buffer);
     }
+
+    GemmConfig get_config() override {
+        GemmConfig c = _subgemm->get_config();
+
+        std::string new_filter = "gemv_batched[";
+        new_filter.append(c.filter);
+        new_filter.append("]");
+
+        c.filter = new_filter;
+
+        return c;
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 9de44fcb73..d4348beabf 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,12 +36,55 @@
 namespace arm_gemm {
 
+namespace {
+
+template<typename OutputStage>
+class run_gemv_kernel {
+public:
+    template<typename strategy, typename To, typename Tr>
+    static void run (
+        const strategy &strat,
+        const To *A_ptr, const To *B_ptr, Tr *c_ptr,
+        size_t N, size_t K,
+        const Tr *bias, const Activation &act, bool Accumulate,
+        const OutputStage &os, const int32_t *col_bias, unsigned int col_base
+    );
+};
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_gemv_kernel<Nothing>::run(
+        const strategy &strat,
+        const To *A_ptr, const To *B_ptr, Tr *C_ptr,
+        size_t N, size_t K,
+        const Tr *bias, const Activation &act, bool Accumulate,
+        const Nothing &, const int32_t *, unsigned int
+    ) {
+
+    strat.kernel(A_ptr, B_ptr, C_ptr, N, K, bias, act, Accumulate);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_gemv_kernel<Requantize32>::run(
+        const strategy &strat,
+        const To *A_ptr, const To *B_ptr, Tr *C_ptr,
+        size_t N, size_t K,
+        const Tr *, const Activation &, bool,
+        const Requantize32 &qp, const int32_t *col_bias, unsigned int col_base
+    ) {
+
+    strat.kernel(A_ptr, B_ptr, C_ptr, N, K, &qp, col_bias + col_base, col_base);
+}
+
+} // anonymous namespace
+
 // Implementation of the GemmCommon abstract class.
 //
 // This implementation is for GEMV with pretransposition.
 //
 // Batches are not supported as a batched GEMV makes no sense (can be converted to a GEMM).
-template<typename strategy, typename To, typename Tr>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
 class GemvPretransposed : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
     typedef typename strategy::result_type Tri;
@@ -55,13 +98,28 @@ class GemvPretransposed : public GemmCommon<To, Tr> {
 
     const Toi *_B_pretransposed = nullptr;
 
+    OutputStage _os;
+
+    // Pointer to the column sums (for quantized cases)
+    int32_t *col_bias = nullptr;
+
+    // Get size of the column sums
+    unsigned int get_col_sum_size() const {
+        if(std::is_same<OutputStage, Requantize32>::value) {
+            return _args._Nsize * _args._nmulti * sizeof(int32_t);
+        } else {
+            return 0;
+        }
+    }
+
 public:
     GemvPretransposed(GemvPretransposed &) = delete;
     GemvPretransposed & operator= (GemvPretransposed &) = delete;
 
-    GemvPretransposed(const GemmArgs &args)
+    GemvPretransposed(const GemmArgs &args, const OutputStage &os = {})
         : _args(args),
-          _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) {
+          _buffer_per_multi(roundup(args._Ksize, strategy::k_unroll()) * roundup(args._Nsize, strategy::out_width())),
+          _os(os) {
         /* For now don't do any blocking. TODO: figure out if we should. */
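         // (Annotation, not part of the upstream patch: a hedged sketch of how this
         // path is exercised. If a caller fills in the same GemmConfig that
         // get_config() reports further down, e.g. hypothetically:
         //
         //     GemmConfig cfg;
         //     cfg.inner_block_size = 256;   // request K blocking
         //
         // and passes it through GemmArgs::_cfg, the check below adopts it as
         // k_block. The supports_accumulate() guard is needed because splitting K
         // only works when partial results can be accumulated into the output.)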
         if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) {
             k_block = args._cfg->inner_block_size;
@@ -117,12 +175,13 @@ public:
 #ifdef CYCLE_PROFILING
                 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
 #endif
-                strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
+                run_gemv_kernel<OutputStage>::run(strat, this->_Aptr + (multi * this->_A_multi_stride) + k0,
                              _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
                              this->_Cptr + (multi * this->_C_multi_stride) + n,
                              (nmax - n), (kmax-k0),
                              this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr,
-                             _args._act, (k0 != 0));
+                             _args._act, (k0 != 0),
+                             _os, col_bias, n + (_args._Nsize * multi));
             }
         }
     }
@@ -139,11 +198,26 @@ public:
     }
 
     size_t get_B_pretransposed_array_size() const override {
-        return _buffer_per_multi * _args._nmulti * sizeof(To);
+        return _buffer_per_multi * _args._nmulti * sizeof(To) + get_col_sum_size();
     }
 
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        Toi *B_buffer = reinterpret_cast<Toi *>(buffer);
+        // Column sums go on the front of the pretransposed buffer in requantized cases.
+        // We could optimize here in case we don't actually need to sum the columns, but this code is only run on setup.
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            col_bias = reinterpret_cast<int32_t *>(buffer);
+
+            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
+
+            for (unsigned int i=0; i<_args._nmulti; i++) {
+                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _args._Nsize), _args._Ksize, i, 0);
+            }
+        }
+
+        // The actual transposed buffer goes after the column sums (if any)
+        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
+        Toi *B_buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+
         strategy strat(_args._ci);
 
         for (unsigned int multi=0; multi<_args._nmulti; multi++) {
@@ -156,6 +230,17 @@ public:
     void set_pretransposed_B_data(void *buffer) override {
         _B_pretransposed = reinterpret_cast<Toi *>(buffer);
     }
+
+    GemmConfig get_config() override {
+        GemmConfig c;
+
+        c.method = GemmMethod::GEMV_PRETRANSPOSED;
+        c.inner_block_size = k_block;
+        c.outer_block_size = n_block;
+        c.filter = get_type_name<strategy>();
+
+        return c;
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
new file mode 100644
index 0000000000..533682c647
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 4, VLType::None, false>( + bfloat16 * &out_ptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #2\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #2\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #2\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #2\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #2\n" + "add x22, x22, %x[row_offset], LSL #2\n" + "add x21, x21, %x[row_offset], LSL #2\n" + "add x20, x20, %x[row_offset], LSL #2\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q23, [x27], #0x10\n" + ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + "ldr q22, [x26], #0x10\n" + "subs %x[width], %x[width], #0x4\n" + ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" + "ldr q21, [x25], #0x10\n" + "cmp %x[width], #0x4\n" + ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" + "ldr q20, [x24], #0x10\n" + "ldr q18, [x23], #0x10\n" + ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" + "ldr q19, [x22], #0x10\n" + ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" + "ldr q16, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "prfm pldl1keep, [x27, #0x70]\n" + "prfm pldl1keep, [x26, #0x70]\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q21, [%x[out_ptr], #0x10]\n" + "str q18, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr d23, [x27], #0x8\n" + "ldr d22, [x26], #0x8\n" + 
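    // (Annotation, not part of the generated kernel: what follows is the tail for
    // width % 4 != 0. Bit 1 of the remaining width drives the surrounding 8-byte
    // loads, i.e. two fp32 lanes per row; bit 0 adds a single extra lane via the
    // "ld1 { v.s }[2]" loads below; label 5 then applies the same bfcvtn/bfcvtn2
    // fp32 -> bf16 conversion and stores as the main loop.)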
"mov x19, #0x1\n" + "ldr d21, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v23.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x26]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v20.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v16.s }[2], [x21]\n" + "ld1 { v17.s }[2], [x20]\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr s23, [x27, #0x0]\n" + "mov x19, #0x1\n" + "ldr s22, [x26, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s20, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s16, [x21, #0x0]\n" + "ldr s17, [x20, #0x0]\n" + "5:" // Odd load end + ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" + ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q21, [%x[out_ptr], #0x10]\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q18, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "6:" // Odds skip + + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp index 52b49c0f0c..b13d32c324 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp @@ -40,6 +40,7 @@ #include "a64_interleave8_block2_bf16_bf16.hpp" #include "a64_interleave8_block2_fp32_fp32.hpp" #include "a64_interleave8_block4_bf16_bf16.hpp" +#include "a64_interleave8_block4_fp32_bf16.hpp" #include "a64_interleave8_block4_s8_s8.hpp" #include "a64_interleave8_block4_s8_s8_summing.hpp" #include "a64_interleave8_block4_u8_u8_summing.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp index a6b1269927..d5003e4a19 100644 --- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. 
  *
  * SPDX-License-Identifier: MIT
  *
@@ -348,6 +348,10 @@ template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_
 template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 #endif // ARM_COMPUTE_ENABLE_BF16
 
 /* Arm® Neon™/SVE using FP32 kernel */
@@ -375,12 +379,10 @@ template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * co
 template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 
-#ifdef ARM_COMPUTE_ENABLE_I8MM
 /* MMLA SMMLA (height 8, block 8) */
 template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
 template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // ARM_COMPUTE_ENABLE_I8MM
 
 /* Arm® Neon™ SDOT (height 8, block 1) */
 template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
@@ -397,12 +399,10 @@ template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *
 template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 
-#ifdef ARM_COMPUTE_ENABLE_I8MM
 /* MMLA SMMLA (height 8, block 8) */
 template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
 template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
 template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // ARM_COMPUTE_ENABLE_I8MM
 
 /* Arm® Neon™ 16-bit (height 8, block 1) */
 template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index b68a5f518a..b7af7110ab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #ifdef __aarch64__
 
 #include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
 
 namespace arm_gemm {
 
@@ -58,6 +59,28 @@ public:
     StdTransformsFixed transforms = {};
     StdTransformsFixed transforms_quantized = {};
 
+    template<typename T>
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        if (std::is_same<T, int32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 3.32, 2.56, 2.63 };
+
+                default:
+                    return { 7.97, 3.72, 7.31 };
+            }
+        }
+
+        if (std::is_same<T, Requantize32>::value) {
+            switch(ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 3.33, 2.89, 0.09 };
+                default:
+                    return { 7.97, 3.74, 0.34 };
+            }
+        }
+    }
+
     kern_type kernel=a64_gemm_s8_4x4;
 
     cls_a64_gemm_s8_4x4(const CPUInfo *) { }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 7c7b894b08..83ccb4681b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -61,13 +61,29 @@ public:
     StdTransformsFixed transforms = {};
     StdTransformsFixed transforms_quantized = {};
 
+    template<typename T>
     static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
-        switch (ci->get_cpu_model()) {
-            case CPUModel::A55r1:
-                return { 15.361, 0.9341, 0.1636 };
+        if (std::is_same<T, int32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 19.73, 2.81, 0.27 };
 
-            default:
-                return { 29.0698, 3.9793, 0.4003 };
+                case CPUModel::A55r1:
+                    return { 15.361, 0.9341, 0.1636 };
+
+                default:
+                    return { 29.0698, 3.9793, 0.4003 };
+            }
+        }
+
+        if (std::is_same<T, Requantize32>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 19.73, 3.41, 3.70 };
+
+                default:
+                    return { 31.81, 3.68, 8.01 };
+            }
         }
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 854b6751c1..07c4769479 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #ifdef __aarch64__
 
+#include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
 namespace arm_gemm {
 
@@ -66,6 +67,30 @@ public:
     StdTransformsFixed transforms = {};
     StdTransformsFixed transforms_quantized = {};
 
+    template<typename T>
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        if (std::is_same<T, uint32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 2.64, 2.72, 2.64 };
+
+                default:
+                    return { 7.95, 3.76, 7.27 };
+            }
+        }
+
+        if (std::is_same<T, Requantize32>::value) {
+            switch(ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 2.64, 1.79, 0.10 };
+                default:
+                    return { 7.95, 4.09, 0.33 };
+            }
+        }
+
+        return { 0.0 };
+    }
+
     kern_type kernel = a64_gemm_u8_4x4;
 
     cls_a64_gemm_u8_4x4(const CPUInfo *) { }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 00ed5d03bf..0329f57615 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,14 +69,38 @@ public:
     StdTransformsFixed transforms = {};
     StdTransformsFixed transforms_quantized = {};
 
+    template<typename T>
     static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
-        switch (ci->get_cpu_model()) {
-            case CPUModel::A55r1:
-                return { 15.361, 0.9341, 0.1636 };
+        if (std::is_same<T, uint32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 19.73, 3.38, 0.27 };
 
-            default:
-                return { 29.0698, 3.9793, 0.4003 };
+                case CPUModel::A55r1:
+                    return { 15.361, 0.9341, 0.1636 };
+
+                case CPUModel::V1:
+                    return { 62.40, 4.71, 0.67 };
+
+                default:
+                    return { 29.0698, 3.9793, 0.4003 };
+            }
         }
+
+        if (std::is_same<T, Requantize32>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A510:
+                    return { 19.73, 3.38, 3.70 };
+
+                case CPUModel::V1:
+                    return { 61.58, 4.78, 10.83 };
+
+                default:
+                    return { 31.82, 3.51, 8.03 };
+            }
+        }
+
+        return { 0.0 };
     }
 
     kern_type kernel = a64_gemm_u8_8x12;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
deleted file mode 100644
index b53172509e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../performance_parameters.hpp" -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool); - -class cls_a64_gemv_fp32_mla_32 -{ -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool); - - static unsigned int out_width() - { - return 32; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_gemv_fp32_mla_32; - - cls_a64_gemv_fp32_mla_32(const CPUInfo *) - { - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp deleted file mode 100644 index 51a9641af5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp +++ /dev/null @@ -1,1547 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include "arm_gemm.hpp" -#include "../../utils.hpp" - -#include -#include - -namespace arm_gemm { - -void a64_gemv_fp32_mla_32 ( - const float *A_ptr, const float *B_ptr, float *output_ptr, - size_t N, size_t K, - const float *bias, Activation act, bool -) -{ - struct KernelArgs { - float maxval = static_cast(std::numeric_limits::infinity()); - float minval = - static_cast(std::numeric_limits::infinity()); - const float *B_ptr = {}; - size_t output_offset = {}; - unsigned int input_initial_col = {}; - } ka; - - unsigned long flags=0; - ka.B_ptr = B_ptr; - switch(act.type) { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - ka.maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - ka.minval = 0; - flags |= 0x2; - break; - } - __asm__ __volatile__( - "add x22, %x[N], #0x3\n" - "mov x21, %x[bias]\n" - "lsr x22, x22, #0x2\n" - "1:" // Column loop - "cmp x22, #0x8\n" - "bge 85f\n" - "cmp x22, #0x6\n" - "bgt 73f\n" - "beq 61f\n" - "cmp x22, #0x4\n" - "bgt 49f\n" - "beq 37f\n" - "cmp x22, #0x2\n" - "bgt 25f\n" - "beq 13f\n" - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 2f\n" - "ldr q24, [x21, #0x0]\n" - "add x21, x21, #0x10\n" - "b 3f\n" - "2:" // Width 1: no bias - "movi v24.16b, #0x0\n" - "3:" // Width 1: setup done - "cmp x20, #0x4\n" - "blt 6f\n" - "cmp x20, #0x8\n" - "blt 5f\n" - "4:" // Width 1: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q2, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v2.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q3, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v3.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q4, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v4.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "prfm pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "bge 4b\n" - "5:" // Width 1: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q5, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v5.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q6, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v6.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q7, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v7.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q8, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v8.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "6:" // Width 1: Multiply loop: Main loop skip - "cbz x20, 8f\n" - "7:" // Width 1: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q9, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v9.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "sub x20, x20, #0x1\n" - 
"cbnz x20, 7b\n" - "8:" // Width 1: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 9f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "9:" // Width 1: No activation - "cmp %x[N], #0x4\n" - "blt 10f\n" - "str q24, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 12f\n" - "10:" // Width 1: Partial writeback - "tbz %x[N], #1, 11f\n" - "str d24, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 12f\n" - "st1 { v24.s }[2], [%x[output_ptr]]\n" - "b 12f\n" - "11:" // Width 1: Partial direct writeback: partial_1_0 - "str s24, [%x[output_ptr], #0x0]\n" - "12:" // Width 1: Writeback done - "b 97f\n" - "13:" // Width 2 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 14f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "add x21, x21, #0x20\n" - "b 15f\n" - "14:" // Width 2: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "15:" // Width 2: setup done - "cmp x20, #0x4\n" - "blt 18f\n" - "cmp x20, #0x8\n" - "blt 17f\n" - "16:" // Width 2: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q3, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v3.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q4, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v4.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q5, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v5.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q6, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v6.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q7, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v7.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q8, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v8.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "prfm pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "bge 16b\n" - "17:" // Width 2: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q9, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v9.4s, v0.s[0]\n" - "ldr q10, [%x[B_ptr], #0x10]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v25.4s, v10.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q11, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v11.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q12, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v12.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v13.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v14.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q15, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q16, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v16.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm 
pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "18:" // Width 2: Multiply loop: Main loop skip - "cbz x20, 20f\n" - "19:" // Width 2: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q17, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v17.4s, v0.s[0]\n" - "ldr q18, [%x[B_ptr], #0x10]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v25.4s, v18.4s, v0.s[0]\n" - "sub x20, x20, #0x1\n" - "cbnz x20, 19b\n" - "20:" // Width 2: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 21f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "21:" // Width 2: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "cmp %x[N], #0x8\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "blt 22f\n" - "str q25, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 24f\n" - "22:" // Width 2: Partial writeback - "tbz %x[N], #1, 23f\n" - "str d25, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 24f\n" - "st1 { v25.s }[2], [%x[output_ptr]]\n" - "b 24f\n" - "23:" // Width 2: Partial direct writeback: partial_1_4 - "tbz %x[N], #0, 24f\n" - "str s25, [%x[output_ptr], #0x0]\n" - "24:" // Width 2: Writeback done - "b 97f\n" - "25:" // Width 3 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 26f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "add x21, x21, #0x30\n" - "b 27f\n" - "26:" // Width 3: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "27:" // Width 3: setup done - "cmp x20, #0x4\n" - "blt 30f\n" - "cmp x20, #0x8\n" - "blt 29f\n" - "28:" // Width 3: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v26.4s, v3.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v4.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q5, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v5.4s, v0.s[1]\n" - "ldr q6, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v6.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q7, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v7.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q8, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v8.4s, v0.s[2]\n" - "ldr q9, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v9.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q10, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v10.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q11, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v11.4s, v0.s[3]\n" - "ldr q12, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v12.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "prfm pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "bge 28b\n" - "29:" // Width 3: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v13.4s, v0.s[0]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - 
"ldr q15, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v14.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v26.4s, v15.4s, v0.s[0]\n" - "ldr q16, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q17, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v17.4s, v0.s[1]\n" - "ldr q18, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v18.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q19, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v19.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q20, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v20.4s, v0.s[2]\n" - "ldr q21, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v21.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q22, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v22.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q23, [%x[B_ptr], #0x10]\n" - "fmla v25.4s, v23.4s, v0.s[3]\n" - "ldr q1, [%x[B_ptr], #0x20]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v1.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "30:" // Width 3: Multiply loop: Main loop skip - "cbz x20, 32f\n" - "31:" // Width 3: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q2, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v2.4s, v0.s[0]\n" - "ldr q3, [%x[B_ptr], #0x10]\n" - "ldr q4, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v3.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v4.4s, v0.s[0]\n" - "sub x20, x20, #0x1\n" - "cbnz x20, 31b\n" - "32:" // Width 3: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 33f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "33:" // Width 3: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "cmp %x[N], #0xc\n" - "add %x[output_ptr], %x[output_ptr], #0x20\n" - "blt 34f\n" - "str q26, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 36f\n" - "34:" // Width 3: Partial writeback - "tbz %x[N], #1, 35f\n" - "str d26, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 36f\n" - "st1 { v26.s }[2], [%x[output_ptr]]\n" - "b 36f\n" - "35:" // Width 3: Partial direct writeback: partial_1_8 - "tbz %x[N], #0, 36f\n" - "str s26, [%x[output_ptr], #0x0]\n" - "36:" // Width 3: Writeback done - "b 97f\n" - "37:" // Width 4 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 38f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "ldr q27, [x21, #0x30]\n" - "add x21, x21, #0x40\n" - "b 39f\n" - "38:" // Width 4: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "39:" // Width 4: setup done - "cmp x20, #0x4\n" - "blt 42f\n" - "cmp x20, #0x8\n" - "blt 41f\n" - "40:" // Width 4: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x30]\n" - "fmla 
v26.4s, v3.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v4.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q5, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v5.4s, v0.s[1]\n" - "ldr q6, [%x[B_ptr], #0x10]\n" - "ldr q7, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v6.4s, v0.s[1]\n" - "ldr q8, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v7.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v8.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q9, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v9.4s, v0.s[2]\n" - "ldr q10, [%x[B_ptr], #0x10]\n" - "ldr q11, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v10.4s, v0.s[2]\n" - "ldr q12, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v11.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v12.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v13.4s, v0.s[3]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - "ldr q15, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v14.4s, v0.s[3]\n" - "ldr q16, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v15.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v16.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [x19, #0x80]\n" - "sub x20, x20, #0x4\n" - "cmp x20, #0x8\n" - "bge 40b\n" - "41:" // Width 4: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q17, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v17.4s, v0.s[0]\n" - "ldr q18, [%x[B_ptr], #0x10]\n" - "ldr q19, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v18.4s, v0.s[0]\n" - "ldr q20, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v19.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v20.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q21, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v21.4s, v0.s[1]\n" - "ldr q22, [%x[B_ptr], #0x10]\n" - "ldr q23, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v22.4s, v0.s[1]\n" - "ldr q1, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v23.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v1.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q2, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v2.4s, v0.s[2]\n" - "ldr q3, [%x[B_ptr], #0x10]\n" - "ldr q4, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v3.4s, v0.s[2]\n" - "ldr q5, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v4.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v5.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q6, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v6.4s, v0.s[3]\n" - "ldr q7, [%x[B_ptr], #0x10]\n" - "ldr q8, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v7.4s, v0.s[3]\n" - "ldr q9, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v8.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v9.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add x19, x19, #0x10\n" - "prfm pldl1keep, [x19, #0x80]\n" - "42:" // Width 4: Multiply loop: Main loop skip - "cbz x20, 44f\n" - "43:" // Width 4: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q10, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v10.4s, v0.s[0]\n" - "ldr q11, [%x[B_ptr], #0x10]\n" - "ldr q12, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v11.4s, v0.s[0]\n" - "ldr q13, 
[%x[B_ptr], #0x30]\n" - "fmla v26.4s, v12.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "sub x20, x20, #0x1\n" - "fmla v27.4s, v13.4s, v0.s[0]\n" - "cbnz x20, 43b\n" - "44:" // Width 4: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 45f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "45:" // Width 4: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "str q26, [%x[output_ptr], #0x20]\n" - "cmp %x[N], #0x10\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "blt 46f\n" - "str q27, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 48f\n" - "46:" // Width 4: Partial writeback - "tbz %x[N], #1, 47f\n" - "str d27, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 48f\n" - "st1 { v27.s }[2], [%x[output_ptr]]\n" - "b 48f\n" - "47:" // Width 4: Partial direct writeback: partial_1_12 - "tbz %x[N], #0, 48f\n" - "str s27, [%x[output_ptr], #0x0]\n" - "48:" // Width 4: Writeback done - "b 97f\n" - "49:" // Width 5 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 50f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "ldr q27, [x21, #0x30]\n" - "ldr q28, [x21, #0x40]\n" - "add x21, x21, #0x50\n" - "b 51f\n" - "50:" // Width 5: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "51:" // Width 5: setup done - "cmp x20, #0x4\n" - "blt 54f\n" - "cmp x20, #0x8\n" - "blt 53f\n" - "52:" // Width 5: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v3.4s, v0.s[0]\n" - "ldr q5, [%x[B_ptr], #0x40]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v27.4s, v4.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q6, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v5.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q7, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v6.4s, v0.s[1]\n" - "ldr q8, [%x[B_ptr], #0x20]\n" - "ldr q9, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v7.4s, v0.s[1]\n" - "ldr q10, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v8.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v9.4s, v0.s[1]\n" - "ldr q11, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v10.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q12, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v11.4s, v0.s[2]\n" - "ldr q13, [%x[B_ptr], #0x20]\n" - "ldr q14, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v12.4s, v0.s[2]\n" - "ldr q15, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v13.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v14.4s, v0.s[2]\n" - "ldr q16, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v15.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q17, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v16.4s, v0.s[3]\n" - "ldr q18, [%x[B_ptr], #0x20]\n" - "ldr q19, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, 
v17.4s, v0.s[3]\n" - "ldr q20, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v18.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v19.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v28.4s, v20.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "prfm pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "bge 52b\n" - "53:" // Width 5: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q21, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v21.4s, v0.s[0]\n" - "ldr q22, [%x[B_ptr], #0x10]\n" - "ldr q23, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v22.4s, v0.s[0]\n" - "ldr q1, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v23.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x40]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v27.4s, v1.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "ldr q3, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v2.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q4, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v3.4s, v0.s[1]\n" - "ldr q5, [%x[B_ptr], #0x20]\n" - "ldr q6, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v4.4s, v0.s[1]\n" - "ldr q7, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v5.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v6.4s, v0.s[1]\n" - "ldr q8, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v7.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q9, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v8.4s, v0.s[2]\n" - "ldr q10, [%x[B_ptr], #0x20]\n" - "ldr q11, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v9.4s, v0.s[2]\n" - "ldr q12, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v10.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v11.4s, v0.s[2]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v12.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v13.4s, v0.s[3]\n" - "ldr q15, [%x[B_ptr], #0x20]\n" - "ldr q16, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v14.4s, v0.s[3]\n" - "ldr q17, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v15.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v16.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v28.4s, v17.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "54:" // Width 5: Multiply loop: Main loop skip - "cbz x20, 56f\n" - "55:" // Width 5: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q18, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v18.4s, v0.s[0]\n" - "ldr q19, [%x[B_ptr], #0x10]\n" - "ldr q20, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v19.4s, v0.s[0]\n" - "ldr q21, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v20.4s, v0.s[0]\n" - "ldr q22, [%x[B_ptr], #0x40]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v27.4s, v21.4s, v0.s[0]\n" - "sub x20, x20, #0x1\n" - "fmla v28.4s, v22.4s, v0.s[0]\n" - "cbnz x20, 55b\n" - "56:" // Width 5: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 57f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmin v28.4s, v28.4s, 
v16.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "57:" // Width 5: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "str q26, [%x[output_ptr], #0x20]\n" - "str q27, [%x[output_ptr], #0x30]\n" - "cmp %x[N], #0x14\n" - "add %x[output_ptr], %x[output_ptr], #0x40\n" - "blt 58f\n" - "str q28, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 60f\n" - "58:" // Width 5: Partial writeback - "tbz %x[N], #1, 59f\n" - "str d28, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 60f\n" - "st1 { v28.s }[2], [%x[output_ptr]]\n" - "b 60f\n" - "59:" // Width 5: Partial direct writeback: partial_1_16 - "tbz %x[N], #0, 60f\n" - "str s28, [%x[output_ptr], #0x0]\n" - "60:" // Width 5: Writeback done - "b 97f\n" - "61:" // Width 6 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 62f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "ldr q27, [x21, #0x30]\n" - "ldr q28, [x21, #0x40]\n" - "ldr q29, [x21, #0x50]\n" - "add x21, x21, #0x60\n" - "b 63f\n" - "62:" // Width 6: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "63:" // Width 6: setup done - "cmp x20, #0x4\n" - "blt 66f\n" - "cmp x20, #0x8\n" - "blt 65f\n" - "64:" // Width 6: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v3.4s, v0.s[0]\n" - "ldr q5, [%x[B_ptr], #0x40]\n" - "ldr q6, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v4.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v5.4s, v0.s[0]\n" - "ldr q7, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q8, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v7.4s, v0.s[1]\n" - "ldr q9, [%x[B_ptr], #0x20]\n" - "ldr q10, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v8.4s, v0.s[1]\n" - "ldr q11, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v9.4s, v0.s[1]\n" - "ldr q12, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v10.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v11.4s, v0.s[1]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v12.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v13.4s, v0.s[2]\n" - "ldr q15, [%x[B_ptr], #0x20]\n" - "ldr q16, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v14.4s, v0.s[2]\n" - "ldr q17, [%x[B_ptr], #0x40]\n" - "ldr q18, [%x[B_ptr], #0x50]\n" - "fmla v26.4s, v15.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v16.4s, v0.s[2]\n" - "ldr q19, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v17.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q20, [%x[B_ptr], #0x10]\n" - "fmla v29.4s, v18.4s, v0.s[2]\n" - "ldr q21, [%x[B_ptr], #0x20]\n" - "ldr q22, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q23, [%x[B_ptr], #0x40]\n" - "ldr q1, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v20.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v26.4s, v21.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v27.4s, v22.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "fmla v28.4s, v23.4s, v0.s[3]\n" - "prfm 
pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "fmla v29.4s, v1.4s, v0.s[3]\n" - "bge 64b\n" - "65:" // Width 6: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q2, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v2.4s, v0.s[0]\n" - "ldr q3, [%x[B_ptr], #0x10]\n" - "ldr q4, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v3.4s, v0.s[0]\n" - "ldr q5, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v4.4s, v0.s[0]\n" - "ldr q6, [%x[B_ptr], #0x40]\n" - "ldr q7, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v5.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v6.4s, v0.s[0]\n" - "ldr q8, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q9, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v8.4s, v0.s[1]\n" - "ldr q10, [%x[B_ptr], #0x20]\n" - "ldr q11, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v9.4s, v0.s[1]\n" - "ldr q12, [%x[B_ptr], #0x40]\n" - "fmla v26.4s, v10.4s, v0.s[1]\n" - "ldr q13, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v11.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v12.4s, v0.s[1]\n" - "ldr q14, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q15, [%x[B_ptr], #0x10]\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "ldr q16, [%x[B_ptr], #0x20]\n" - "ldr q17, [%x[B_ptr], #0x30]\n" - "fmla v25.4s, v15.4s, v0.s[2]\n" - "ldr q18, [%x[B_ptr], #0x40]\n" - "ldr q19, [%x[B_ptr], #0x50]\n" - "fmla v26.4s, v16.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v17.4s, v0.s[2]\n" - "ldr q20, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v18.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q21, [%x[B_ptr], #0x10]\n" - "fmla v29.4s, v19.4s, v0.s[2]\n" - "ldr q22, [%x[B_ptr], #0x20]\n" - "ldr q23, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v20.4s, v0.s[3]\n" - "ldr q1, [%x[B_ptr], #0x40]\n" - "ldr q2, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v21.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v26.4s, v22.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v27.4s, v23.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "fmla v28.4s, v1.4s, v0.s[3]\n" - "fmla v29.4s, v2.4s, v0.s[3]\n" - "66:" // Width 6: Multiply loop: Main loop skip - "cbz x20, 68f\n" - "67:" // Width 6: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q3, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v3.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x10]\n" - "ldr q5, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v4.4s, v0.s[0]\n" - "ldr q6, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v5.4s, v0.s[0]\n" - "ldr q7, [%x[B_ptr], #0x40]\n" - "ldr q8, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v6.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "sub x20, x20, #0x1\n" - "fmla v28.4s, v7.4s, v0.s[0]\n" - "fmla v29.4s, v8.4s, v0.s[0]\n" - "cbnz x20, 67b\n" - "68:" // Width 6: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 69f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, 
v17.4s\n" - "fmin v28.4s, v28.4s, v16.4s\n" - "fmin v29.4s, v29.4s, v16.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" - "69:" // Width 6: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "str q26, [%x[output_ptr], #0x20]\n" - "str q27, [%x[output_ptr], #0x30]\n" - "str q28, [%x[output_ptr], #0x40]\n" - "cmp %x[N], #0x18\n" - "add %x[output_ptr], %x[output_ptr], #0x50\n" - "blt 70f\n" - "str q29, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 72f\n" - "70:" // Width 6: Partial writeback - "tbz %x[N], #1, 71f\n" - "str d29, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 72f\n" - "st1 { v29.s }[2], [%x[output_ptr]]\n" - "b 72f\n" - "71:" // Width 6: Partial direct writeback: partial_1_20 - "tbz %x[N], #0, 72f\n" - "str s29, [%x[output_ptr], #0x0]\n" - "72:" // Width 6: Writeback done - "b 97f\n" - "73:" // Width 7 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 74f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "ldr q27, [x21, #0x30]\n" - "ldr q28, [x21, #0x40]\n" - "ldr q29, [x21, #0x50]\n" - "ldr q30, [x21, #0x60]\n" - "add x21, x21, #0x70\n" - "b 75f\n" - "74:" // Width 7: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "75:" // Width 7: setup done - "cmp x20, #0x4\n" - "blt 78f\n" - "cmp x20, #0x8\n" - "blt 77f\n" - "76:" // Width 7: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v3.4s, v0.s[0]\n" - "ldr q5, [%x[B_ptr], #0x40]\n" - "ldr q6, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v4.4s, v0.s[0]\n" - "ldr q7, [%x[B_ptr], #0x60]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v28.4s, v5.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v29.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q8, [%x[B_ptr], #0x0]\n" - "fmla v30.4s, v7.4s, v0.s[0]\n" - "ldr q9, [%x[B_ptr], #0x10]\n" - "ldr q10, [%x[B_ptr], #0x20]\n" - "fmla v24.4s, v8.4s, v0.s[1]\n" - "ldr q11, [%x[B_ptr], #0x30]\n" - "ldr q12, [%x[B_ptr], #0x40]\n" - "fmla v25.4s, v9.4s, v0.s[1]\n" - "ldr q13, [%x[B_ptr], #0x50]\n" - "fmla v26.4s, v10.4s, v0.s[1]\n" - "ldr q14, [%x[B_ptr], #0x60]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v27.4s, v11.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v12.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q15, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v13.4s, v0.s[1]\n" - "ldr q16, [%x[B_ptr], #0x10]\n" - "ldr q17, [%x[B_ptr], #0x20]\n" - "fmla v30.4s, v14.4s, v0.s[1]\n" - "ldr q18, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v15.4s, v0.s[2]\n" - "ldr q19, [%x[B_ptr], #0x40]\n" - "ldr q20, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v16.4s, v0.s[2]\n" - "ldr q21, [%x[B_ptr], #0x60]\n" - "fmla v26.4s, v17.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v18.4s, v0.s[2]\n" - "ldr q22, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v19.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q23, [%x[B_ptr], #0x10]\n" - "fmla v29.4s, v20.4s, v0.s[2]\n" - "ldr q1, [%x[B_ptr], #0x20]\n" - "ldr q2, [%x[B_ptr], #0x30]\n" - "fmla v30.4s, v21.4s, v0.s[2]\n" - "ldr q3, [%x[B_ptr], 
#0x40]\n" - "fmla v24.4s, v22.4s, v0.s[3]\n" - "ldr q4, [%x[B_ptr], #0x50]\n" - "ldr q5, [%x[B_ptr], #0x60]\n" - "fmla v25.4s, v23.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v1.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v2.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add x19, x19, #0x10\n" - "fmla v28.4s, v3.4s, v0.s[3]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "sub x20, x20, #0x4\n" - "fmla v29.4s, v4.4s, v0.s[3]\n" - "cmp x20, #0x8\n" - "fmla v30.4s, v5.4s, v0.s[3]\n" - "bge 76b\n" - "77:" // Width 7: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q6, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v6.4s, v0.s[0]\n" - "ldr q7, [%x[B_ptr], #0x10]\n" - "ldr q8, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v7.4s, v0.s[0]\n" - "ldr q9, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v8.4s, v0.s[0]\n" - "ldr q10, [%x[B_ptr], #0x40]\n" - "ldr q11, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v9.4s, v0.s[0]\n" - "ldr q12, [%x[B_ptr], #0x60]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v28.4s, v10.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v29.4s, v11.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q13, [%x[B_ptr], #0x0]\n" - "fmla v30.4s, v12.4s, v0.s[0]\n" - "ldr q14, [%x[B_ptr], #0x10]\n" - "ldr q15, [%x[B_ptr], #0x20]\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "ldr q16, [%x[B_ptr], #0x30]\n" - "ldr q17, [%x[B_ptr], #0x40]\n" - "fmla v25.4s, v14.4s, v0.s[1]\n" - "ldr q18, [%x[B_ptr], #0x50]\n" - "fmla v26.4s, v15.4s, v0.s[1]\n" - "ldr q19, [%x[B_ptr], #0x60]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v27.4s, v16.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v17.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q20, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v18.4s, v0.s[1]\n" - "ldr q21, [%x[B_ptr], #0x10]\n" - "ldr q22, [%x[B_ptr], #0x20]\n" - "fmla v30.4s, v19.4s, v0.s[1]\n" - "ldr q23, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v20.4s, v0.s[2]\n" - "ldr q1, [%x[B_ptr], #0x40]\n" - "ldr q2, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v21.4s, v0.s[2]\n" - "ldr q3, [%x[B_ptr], #0x60]\n" - "fmla v26.4s, v22.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v23.4s, v0.s[2]\n" - "ldr q4, [%x[B_ptr], #0x0]\n" - "fmla v28.4s, v1.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q5, [%x[B_ptr], #0x10]\n" - "fmla v29.4s, v2.4s, v0.s[2]\n" - "ldr q6, [%x[B_ptr], #0x20]\n" - "ldr q7, [%x[B_ptr], #0x30]\n" - "fmla v30.4s, v3.4s, v0.s[2]\n" - "ldr q8, [%x[B_ptr], #0x40]\n" - "fmla v24.4s, v4.4s, v0.s[3]\n" - "ldr q9, [%x[B_ptr], #0x50]\n" - "ldr q10, [%x[B_ptr], #0x60]\n" - "fmla v25.4s, v5.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v26.4s, v6.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v27.4s, v7.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "add x19, x19, #0x10\n" - "fmla v28.4s, v8.4s, v0.s[3]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "fmla v29.4s, v9.4s, v0.s[3]\n" - "fmla v30.4s, v10.4s, v0.s[3]\n" - "78:" // Width 7: Multiply loop: Main loop skip - "cbz x20, 80f\n" - "79:" // Width 7: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q11, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v11.4s, v0.s[0]\n" - "ldr q12, [%x[B_ptr], #0x10]\n" - "ldr q13, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v12.4s, v0.s[0]\n" - "ldr q14, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v13.4s, v0.s[0]\n" - "ldr q15, [%x[B_ptr], #0x40]\n" - "ldr q16, 
[%x[B_ptr], #0x50]\n" - "fmla v27.4s, v14.4s, v0.s[0]\n" - "ldr q17, [%x[B_ptr], #0x60]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "fmla v28.4s, v15.4s, v0.s[0]\n" - "fmla v29.4s, v16.4s, v0.s[0]\n" - "sub x20, x20, #0x1\n" - "fmla v30.4s, v17.4s, v0.s[0]\n" - "cbnz x20, 79b\n" - "80:" // Width 7: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 81f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmin v28.4s, v28.4s, v16.4s\n" - "fmin v29.4s, v29.4s, v16.4s\n" - "fmin v30.4s, v30.4s, v16.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" - "fmax v30.4s, v30.4s, v17.4s\n" - "81:" // Width 7: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "str q26, [%x[output_ptr], #0x20]\n" - "str q27, [%x[output_ptr], #0x30]\n" - "str q28, [%x[output_ptr], #0x40]\n" - "str q29, [%x[output_ptr], #0x50]\n" - "cmp %x[N], #0x1c\n" - "add %x[output_ptr], %x[output_ptr], #0x60\n" - "blt 82f\n" - "str q30, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 84f\n" - "82:" // Width 7: Partial writeback - "tbz %x[N], #1, 83f\n" - "str d30, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 84f\n" - "st1 { v30.s }[2], [%x[output_ptr]]\n" - "b 84f\n" - "83:" // Width 7: Partial direct writeback: partial_1_24 - "tbz %x[N], #0, 84f\n" - "str s30, [%x[output_ptr], #0x0]\n" - "84:" // Width 7: Writeback done - "b 97f\n" - "85:" // Width 8 - "mov x20, %x[K]\n" - "mov x19, %x[A_ptr]\n" - "cbz x21, 86f\n" - "ldr q24, [x21, #0x0]\n" - "ldr q25, [x21, #0x10]\n" - "ldr q26, [x21, #0x20]\n" - "ldr q27, [x21, #0x30]\n" - "ldr q28, [x21, #0x40]\n" - "ldr q29, [x21, #0x50]\n" - "ldr q30, [x21, #0x60]\n" - "ldr q31, [x21, #0x70]\n" - "add x21, x21, #0x80\n" - "b 87f\n" - "86:" // Width 8: no bias - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "87:" // Width 8: setup done - "cmp x20, #0x4\n" - "blt 90f\n" - "cmp x20, #0x8\n" - "blt 89f\n" - "88:" // Width 8: Multiply loop: Main loop head - "ldr q0, [x19, #0x0]\n" - "ldr q1, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v1.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x10]\n" - "ldr q3, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v2.4s, v0.s[0]\n" - "ldr q4, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v3.4s, v0.s[0]\n" - "ldr q5, [%x[B_ptr], #0x40]\n" - "ldr q6, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v4.4s, v0.s[0]\n" - "ldr q7, [%x[B_ptr], #0x60]\n" - "ldr q8, [%x[B_ptr], #0x70]\n" - "fmla v28.4s, v5.4s, v0.s[0]\n" - "fmla v29.4s, v6.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v30.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q9, [%x[B_ptr], #0x0]\n" - "fmla v31.4s, v8.4s, v0.s[0]\n" - "ldr q10, [%x[B_ptr], #0x10]\n" - "ldr q11, [%x[B_ptr], #0x20]\n" - "fmla v24.4s, v9.4s, v0.s[1]\n" - "ldr q12, [%x[B_ptr], #0x30]\n" - "ldr q13, [%x[B_ptr], #0x40]\n" - "fmla v25.4s, v10.4s, v0.s[1]\n" - "fmla v26.4s, v11.4s, v0.s[1]\n" - "ldr q14, [%x[B_ptr], #0x50]\n" - "ldr q15, 
[%x[B_ptr], #0x60]\n" - "fmla v27.4s, v12.4s, v0.s[1]\n" - "ldr q16, [%x[B_ptr], #0x70]\n" - "fmla v28.4s, v13.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v29.4s, v14.4s, v0.s[1]\n" - "ldr q17, [%x[B_ptr], #0x0]\n" - "fmla v30.4s, v15.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q18, [%x[B_ptr], #0x10]\n" - "fmla v31.4s, v16.4s, v0.s[1]\n" - "ldr q19, [%x[B_ptr], #0x20]\n" - "ldr q20, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v17.4s, v0.s[2]\n" - "ldr q21, [%x[B_ptr], #0x40]\n" - "ldr q22, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v18.4s, v0.s[2]\n" - "ldr q23, [%x[B_ptr], #0x60]\n" - "fmla v26.4s, v19.4s, v0.s[2]\n" - "ldr q1, [%x[B_ptr], #0x70]\n" - "fmla v27.4s, v20.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v21.4s, v0.s[2]\n" - "ldr q2, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v22.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q3, [%x[B_ptr], #0x10]\n" - "fmla v30.4s, v23.4s, v0.s[2]\n" - "ldr q4, [%x[B_ptr], #0x20]\n" - "ldr q5, [%x[B_ptr], #0x30]\n" - "fmla v31.4s, v1.4s, v0.s[2]\n" - "ldr q6, [%x[B_ptr], #0x40]\n" - "fmla v24.4s, v2.4s, v0.s[3]\n" - "ldr q7, [%x[B_ptr], #0x50]\n" - "ldr q8, [%x[B_ptr], #0x60]\n" - "fmla v25.4s, v3.4s, v0.s[3]\n" - "ldr q9, [%x[B_ptr], #0x70]\n" - "fmla v26.4s, v4.4s, v0.s[3]\n" - "fmla v27.4s, v5.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v6.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v29.4s, v7.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x20, x20, #0x4\n" - "fmla v30.4s, v8.4s, v0.s[3]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "cmp x20, #0x8\n" - "fmla v31.4s, v9.4s, v0.s[3]\n" - "bge 88b\n" - "89:" // Width 8: Multiply loop: Single iteration only - "sub x20, x20, #0x4\n" - "ldr q0, [x19, #0x0]\n" - "ldr q10, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v10.4s, v0.s[0]\n" - "ldr q11, [%x[B_ptr], #0x10]\n" - "ldr q12, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v11.4s, v0.s[0]\n" - "ldr q13, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v12.4s, v0.s[0]\n" - "ldr q14, [%x[B_ptr], #0x40]\n" - "ldr q15, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v13.4s, v0.s[0]\n" - "ldr q16, [%x[B_ptr], #0x60]\n" - "ldr q17, [%x[B_ptr], #0x70]\n" - "fmla v28.4s, v14.4s, v0.s[0]\n" - "fmla v29.4s, v15.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v30.4s, v16.4s, v0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q18, [%x[B_ptr], #0x0]\n" - "fmla v31.4s, v17.4s, v0.s[0]\n" - "ldr q19, [%x[B_ptr], #0x10]\n" - "ldr q20, [%x[B_ptr], #0x20]\n" - "fmla v24.4s, v18.4s, v0.s[1]\n" - "ldr q21, [%x[B_ptr], #0x30]\n" - "ldr q22, [%x[B_ptr], #0x40]\n" - "fmla v25.4s, v19.4s, v0.s[1]\n" - "fmla v26.4s, v20.4s, v0.s[1]\n" - "ldr q23, [%x[B_ptr], #0x50]\n" - "ldr q1, [%x[B_ptr], #0x60]\n" - "fmla v27.4s, v21.4s, v0.s[1]\n" - "ldr q2, [%x[B_ptr], #0x70]\n" - "fmla v28.4s, v22.4s, v0.s[1]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v29.4s, v23.4s, v0.s[1]\n" - "ldr q3, [%x[B_ptr], #0x0]\n" - "fmla v30.4s, v1.4s, v0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q4, [%x[B_ptr], #0x10]\n" - "fmla v31.4s, v2.4s, v0.s[1]\n" - "ldr q5, [%x[B_ptr], #0x20]\n" - "ldr q6, [%x[B_ptr], #0x30]\n" - "fmla v24.4s, v3.4s, v0.s[2]\n" - "ldr q7, [%x[B_ptr], #0x40]\n" - "ldr q8, [%x[B_ptr], #0x50]\n" - "fmla v25.4s, v4.4s, v0.s[2]\n" - "ldr q9, [%x[B_ptr], #0x60]\n" - 
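// A note on the activation epilogue used by all of these width blocks: bit 1
// of %x[flags] gates the clamp ("tbz %x[flags], #1, ..."), and minval/maxval
// are broadcast from the KernelArgs block with ld1r before the fmin/fmax
// pairs. ReLU sets minval to 0 and leaves maxval at +infinity; BoundedReLU
// sets both; so the two activations share one code path. A minimal NEON
// intrinsics sketch of the same clamp on one accumulator (names are
// illustrative, not the generated code):
//
//     float32x4_t vmax = vld1q_dup_f32(&ka.maxval);
//     float32x4_t vmin = vld1q_dup_f32(&ka.minval);
//     acc = vmaxq_f32(vminq_f32(acc, vmax), vmin);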
"fmla v26.4s, v5.4s, v0.s[2]\n" - "ldr q10, [%x[B_ptr], #0x70]\n" - "fmla v27.4s, v6.4s, v0.s[2]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v7.4s, v0.s[2]\n" - "ldr q11, [%x[B_ptr], #0x0]\n" - "fmla v29.4s, v8.4s, v0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ldr q12, [%x[B_ptr], #0x10]\n" - "fmla v30.4s, v9.4s, v0.s[2]\n" - "ldr q13, [%x[B_ptr], #0x20]\n" - "ldr q14, [%x[B_ptr], #0x30]\n" - "fmla v31.4s, v10.4s, v0.s[2]\n" - "ldr q15, [%x[B_ptr], #0x40]\n" - "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr q16, [%x[B_ptr], #0x50]\n" - "ldr q17, [%x[B_ptr], #0x60]\n" - "fmla v25.4s, v12.4s, v0.s[3]\n" - "ldr q18, [%x[B_ptr], #0x70]\n" - "fmla v26.4s, v13.4s, v0.s[3]\n" - "fmla v27.4s, v14.4s, v0.s[3]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla v28.4s, v15.4s, v0.s[3]\n" - "add x19, x19, #0x10\n" - "fmla v29.4s, v16.4s, v0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla v30.4s, v17.4s, v0.s[3]\n" - "prfm pldl1keep, [x19, #0x80]\n" - "fmla v31.4s, v18.4s, v0.s[3]\n" - "90:" // Width 8: Multiply loop: Main loop skip - "cbz x20, 92f\n" - "91:" // Width 8: Multiply loop: Odd block loop - "ldr s0, [x19], #0x4\n" - "ldr q19, [%x[B_ptr], #0x0]\n" - "fmla v24.4s, v19.4s, v0.s[0]\n" - "ldr q20, [%x[B_ptr], #0x10]\n" - "ldr q21, [%x[B_ptr], #0x20]\n" - "fmla v25.4s, v20.4s, v0.s[0]\n" - "ldr q22, [%x[B_ptr], #0x30]\n" - "fmla v26.4s, v21.4s, v0.s[0]\n" - "ldr q23, [%x[B_ptr], #0x40]\n" - "ldr q1, [%x[B_ptr], #0x50]\n" - "fmla v27.4s, v22.4s, v0.s[0]\n" - "ldr q2, [%x[B_ptr], #0x60]\n" - "ldr q3, [%x[B_ptr], #0x70]\n" - "fmla v28.4s, v23.4s, v0.s[0]\n" - "fmla v29.4s, v1.4s, v0.s[0]\n" - "add %x[B_ptr], %x[B_ptr], #0x80\n" - "sub x20, x20, #0x1\n" - "fmla v30.4s, v2.4s, v0.s[0]\n" - "fmla v31.4s, v3.4s, v0.s[0]\n" - "cbnz x20, 91b\n" - "92:" // Width 8: Multiply loop: No odd multiplies - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 93f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x19]\n" - "fmin v24.4s, v24.4s, v16.4s\n" - "fmin v25.4s, v25.4s, v16.4s\n" - "fmin v26.4s, v26.4s, v16.4s\n" - "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmin v28.4s, v28.4s, v16.4s\n" - "fmin v29.4s, v29.4s, v16.4s\n" - "fmin v30.4s, v30.4s, v16.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" - "fmax v30.4s, v30.4s, v17.4s\n" - "fmin v31.4s, v31.4s, v16.4s\n" - "fmax v31.4s, v31.4s, v17.4s\n" - "93:" // Width 8: No activation - "str q24, [%x[output_ptr], #0x0]\n" - "str q25, [%x[output_ptr], #0x10]\n" - "str q26, [%x[output_ptr], #0x20]\n" - "str q27, [%x[output_ptr], #0x30]\n" - "str q28, [%x[output_ptr], #0x40]\n" - "str q29, [%x[output_ptr], #0x50]\n" - "str q30, [%x[output_ptr], #0x60]\n" - "cmp %x[N], #0x20\n" - "add %x[output_ptr], %x[output_ptr], #0x70\n" - "blt 94f\n" - "str q31, [%x[output_ptr], #0x0]\n" - "add %x[output_ptr], %x[output_ptr], #0x10\n" - "b 96f\n" - "94:" // Width 8: Partial writeback - "tbz %x[N], #1, 95f\n" - "str d31, [%x[output_ptr]], #0x8\n" - "tbz %x[N], #0, 96f\n" - "st1 { v31.s }[2], [%x[output_ptr]]\n" - "b 96f\n" - "95:" // Width 8: Partial direct writeback: partial_1_28 - "tbz %x[N], #0, 96f\n" - "str s31, [%x[output_ptr], #0x0]\n" - "96:" // Width 8: Writeback done - "subs x22, x22, #0x8\n" - "sub %x[N], %x[N], 
#0x20\n" - "bgt 1b\n" - "97:" // Exit - - : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) - : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm - -#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp index cccedc6b9c..586d6a64a4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,6 +62,7 @@ public: // Use the standard fixed size transforms. StdTransformsFixed transforms = {}; + template static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { switch (ci->get_cpu_model()) { case CPUModel::A55r1: diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp index 29cdd33893..e5728beba8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp @@ -71,10 +71,6 @@ void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp register float16x8_t b2 asm("v6"); __asm __volatile ( - // Enable FP16 instruction support (but only if it's not already on). -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ".arch armv8.2-a+fp16\n" -#endif // Initialize result registers, load initial operands, prime prefetches. "movi v8.8h, #0x0\n" "ldr %d[a0], [%[a_ptr]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp index c9c48dd1c0..23b87fa192 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp @@ -66,10 +66,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp register float16x8_t b2a asm("v7"); __asm __volatile ( - // Enable FP16 instruction support (but only if it's not already on). -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ".arch armv8.2-a+fp16\n" -#endif // Initialize result registers, load initial operands, prime prefetches. "movi v8.8h, #0x0\n" "ldr %q[a0], [%[a_ptr]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp index a6d2405e7e..b47fa6a2d7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp @@ -63,10 +63,6 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 register float16x8_t b2 asm("v4"); __asm __volatile ( - // Enable FP16 instruction support (but only if it's not already on). -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ".arch armv8.2-a+fp16\n" -#endif // Initialize result registers, load initial operands, prime prefetches. 
"movi v8.8h, #0x0\n" "ldr %q[a0], [%[a_ptr]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp index fca96f6028..3b8770e153 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp @@ -22,10 +22,11 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef __aarch64__ +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -43,7 +44,8 @@ void a64_hybrid_bf16fp32_dot_6x16( ARGLIST ); class cls_a64_hybrid_bf16fp32_dot_6x16 { public: - typedef bfloat16 operand_type; + typedef bfloat16 lhs_operand_type; + typedef bfloat16 rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -69,7 +71,23 @@ public: return true; } - StdTransformsFixed transforms = {}; + StdTransformsFixed transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.83 }; + case CPUModel::A510: + return { 7.28 }; + case CPUModel::V1: + return { 27.34 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=a64_hybrid_bf16fp32_dot_6x16; @@ -81,4 +99,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp index afb06dedea..27e08135b6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp @@ -1988,8 +1988,8 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ld1 { v22.4s }, [x22], #0x10\n" "ld1 { v26.4s }, [x21], #0x10\n" "tbz x11, #1, 144f\n" - "mov x19, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x19, #0x38\n" "ldr d15, [x24], #0x8\n" "ldr d19, [x23], #0x8\n" "ldr d23, [x22], #0x8\n" @@ -2042,8 +2042,8 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ld1 { v20.4s }, [x22], #0x10\n" "ld1 { v24.4s }, [x21], #0x10\n" "tbz x11, #1, 148f\n" - "mov x19, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x19, #0x18\n" "ldr d13, [x24], #0x8\n" "ldr d17, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" @@ -2717,12 +2717,12 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ld1 { v16.4s }, [x23], #0x10\n" "ld1 { v20.4s }, [x22], #0x10\n" "ld1 { v24.4s }, [x21], #0x10\n" - "ld1 { v28.4s }, [x20], #0x10\n" "ld1 { v9.4s }, [x28], #0x10\n" "ld1 { v13.4s }, [x24], #0x10\n" "ld1 { v17.4s }, [x23], #0x10\n" "ld1 { v21.4s }, [x22], #0x10\n" "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" "ld1 { v29.4s }, [x20], #0x10\n" "tbz x11, #2, 180f\n" "ld1 { v10.4s }, [x28], #0x10\n" @@ -2732,8 +2732,8 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ld1 { v26.4s }, [x21], #0x10\n" "ld1 { v30.4s }, [x20], #0x10\n" "tbz x11, #1, 179f\n" - "mov x19, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x19, #0x38\n" "ldr d15, [x24], #0x8\n" "ldr d19, [x23], #0x8\n" "ldr d23, [x22], #0x8\n" @@ -2793,8 +2793,8 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ld1 { v24.4s }, [x21], #0x10\n" "ld1 { v28.4s }, [x20], #0x10\n" "tbz x11, #1, 183f\n" - "mov x19, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x19, #0x18\n" "ldr d13, [x24], #0x8\n" "ldr d17, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" diff 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..8cb743b777
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<bfloat16>, \
+    size_t, size_t, \
+    const bfloat16 *, \
+    IndirectOutputArg<float>, \
+    const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_bf16fp32_mmla_6x16
+{
+public:
+    typedef bfloat16 lhs_operand_type;
+    typedef bfloat16 rhs_operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        if (std::is_same<T, bfloat16>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 25.04 };
+                case CPUModel::A510:
+                    return { 7.27 };
+                case CPUModel::V1:
+                    return { 40.09 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_bf16fp32_mmla_6x16;
+    cls_a64_hybrid_bf16fp32_mmla_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..0fa358e848
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3725 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_mmla_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const bfloat16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 186f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 149f\n"
+      "beq 112f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 75f\n"
+      "beq 38f\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x9, %x[bias]\n"
+      "mov x28, %x[output_ptr]\n"
+      "2:"  // Height 1: Column loop
+      "cbz x9, 3f\n"
+      "ldr q8, [x9, #0x0]\n"
+      "zip2 v12.2d, v8.2d, v8.2d\n"
+      "ldr q9, [x9, #0x10]\n"
+      "zip1 v8.2d, v8.2d, v8.2d\n"
+      "ldr q10, [x9, #0x20]\n"
+      "ldr q11, [x9, #0x30]\n"
+      "zip2 v13.2d, v9.2d, v9.2d\n"
+      "add x9, x9, #0x40\n"
+      "zip1 v9.2d, v9.2d,
v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 15f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #2, 5f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "tbz x11, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v16.s }[2], [x28]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 11f\n" + "ldr s16, [x28, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr d11, [x28], #0x8\n" + "mov x19, #0x28\n" + "tbz x11, #0, 11f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 11f\n" + "ldr s11, [x28, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #1, 8f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "tbz x11, #0, 11f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 11f\n" + "ldr s10, [x28, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr d9, [x28], #0x8\n" + "mov x19, #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "11:" // Height 1: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 13f\n" + "12:" // Height 1: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "13:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x27, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x26, #0x8\n" + "blt 21f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x10\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "movi v2.16b, #0x0\n" + "ldr q7, [x10, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + 
"ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "movi v2.16b, #0x0\n" + "ldr q7, [x10, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 26f\n" + "cmp x26, #0x4\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x4\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, 
#0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "bge 22b\n" + "cbz x26, 26f\n" + "23:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 24f\n" + "ldr s1, [x25], #0x4\n" + "tbz x26, #0, 25f\n" + "ld1 { v1.h }[2], [x25]\n" + "b 25f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "25:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 16b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "27:" // Height 1: No activation + "uzp1 v8.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "bge 36f\n" + "tbz x11, #3, 31f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 29f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 28f\n" + "str d11, [x28], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v11.s }[2], [x28]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 35f\n" + "str s11, [x28, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 30f\n" + "str d10, [x28], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v10.s }[2], [x28]\n" + "b 35f\n" + "30:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 35f\n" + "str s10, [x28, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 33f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 32f\n" + "str d9, [x28], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v9.s }[2], [x28]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 35f\n" + "str s9, [x28, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 34f\n" + "str d8, [x28], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v8.s }[2], [x28]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + 
"str s8, [x28, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "37:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "39:" // Height 2: Column loop + "cbz x9, 40f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "add x9, x9, #0x40\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 52f\n" + "40:" // Height 2: no bias + "tbz %x[flags], #0, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "bge 49f\n" + "tbz x11, #3, 44f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 42f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 41f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "tbz x11, #0, 48f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "b 48f\n" + "41:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 48f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "b 48f\n" + "42:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 43f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "tbz x11, #0, 48f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "b 48f\n" + "43:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 48f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "b 48f\n" + "44:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 46f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 45f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "tbz x11, #0, 48f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "b 48f\n" + "45:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 48f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "b 48f\n" + "46:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 47f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "tbz x11, #0, 48f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "b 48f\n" + "47:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "48:" // Height 2: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 50f\n" + "49:" // Height 2: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "50:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 
v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 52f\n" + "51:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "52:" // Height 2: setup done + "mov x27, #0x0\n" + "53:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 55f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 55f\n" + "54:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "55:" // Height 2: input setup done + "cmp x26, #0x8\n" + "blt 58f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x24, x24, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "sub x26, x26, #0x8\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "cmp x26, #0x10\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + "bge 56b\n" + "57:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "add x24, x24, #0x10\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr 
q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "58:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 63f\n" + "cmp x26, #0x4\n" + "blt 60f\n" + "59:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x4\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "bge 59b\n" + "cbz x26, 63f\n" + "60:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 61f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "tbz x26, #0, 62f\n" + "ld1 { v1.h }[2], [x25]\n" + "ld1 { v2.h }[2], [x24]\n" + "b 62f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "ldr h2, [x24, #0x0]\n" + "62:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "63:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 53b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 64f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, 
v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "64:" // Height 2: No activation + "uzp1 v7.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "bge 73f\n" + "tbz x11, #3, 68f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "tbz x11, #2, 66f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "tbz x11, #1, 65f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 72f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 67f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 72f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 70f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "tbz x11, #1, 69f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "b 72f\n" + "69:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 72f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 71f\n" + "str d7, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "74:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 39b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "76:" // Height 3: Column loop + "cbz x9, 77f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, 
v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 89f\n" + "77:" // Height 3: no bias + "tbz %x[flags], #0, 88f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 86f\n" + "tbz x11, #3, 81f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #2, 79f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "tbz x11, #1, 78f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 85f\n" + "78:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 85f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "b 85f\n" + "79:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 80f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "b 85f\n" + "80:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 85f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "b 85f\n" + "81:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 83f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #1, 82f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "b 85f\n" + "82:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 85f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "b 85f\n" + "83:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 84f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "b 85f\n" + "84:" // Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "85:" // Height 3: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 87f\n" + "86:" // Height 3: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "87:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + 
"zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 89f\n" + "88:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "89:" // Height 3: setup done + "mov x27, #0x0\n" + "90:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 91f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 92f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 92f\n" + "91:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "92:" // Height 3: input setup done + "cmp x26, #0x8\n" + "blt 95f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x10\n" + "blt 94f\n" + "93:" // Height 3: Multiply loop: Main loop head + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, 
v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "bge 93b\n" + "94:" // Height 3: Multiply loop: Single iteration only + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "95:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 100f\n" + "cmp x26, #0x4\n" + "blt 97f\n" + "96:" // Height 3: 
Multiply loop: Odd block loop + "movi v4.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x4\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 96b\n" + "cbz x26, 100f\n" + "97:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 98f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "tbz x26, #0, 99f\n" + "ld1 { v1.h }[2], [x25]\n" + "ld1 { v2.h }[2], [x24]\n" + "ld1 { v3.h }[2], [x23]\n" + "b 99f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "ldr h2, [x24, #0x0]\n" + "ldr h3, [x23, #0x0]\n" + "99:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v4.16b, #0x0\n" + "ldr q7, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "100:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 90b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 101f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, 
%x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "101:" // Height 3: No activation + "uzp1 v7.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "bge 110f\n" + "tbz x11, #3, 105f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 103f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 102f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 109f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 104f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 109f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 107f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 106f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 109f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 108f\n" + "str d7, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + 
"str d16, [x23], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "111:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 76b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "113:" // Height 4: Column loop + "cbz x9, 114f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 126f\n" + "114:" // Height 4: no bias + "tbz %x[flags], #0, 125f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 123f\n" + "tbz x11, #3, 118f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 116f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 115f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "b 122f\n" + "115:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 122f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "b 122f\n" + "116:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 117f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "b 122f\n" + "117:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 122f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "b 122f\n" + "118:" // Height 4: Partial accumulate: 
partial_4_0 + "tbz x11, #2, 120f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 119f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "b 122f\n" + "119:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 122f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "b 122f\n" + "120:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 121f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "b 122f\n" + "121:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "122:" // Height 4: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 124f\n" + "123:" // Height 4: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "124:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 126f\n" + "125:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "126:" // Height 4: setup done + "mov x27, #0x0\n" + "127:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 128f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 129f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 129f\n" + "128:" // Height 4: setup direct input + 
"mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "129:" // Height 4: input setup done + "cmp x26, #0x8\n" + "blt 132f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "blt 131f\n" + "130:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x8\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "cmp x26, #0x10\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "bge 130b\n" + "131:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, 
#0x10\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "132:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 137f\n" + "cmp x26, #0x4\n" + "blt 134f\n" + "133:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x4\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" 
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 133b\n" + "cbz x26, 137f\n" + "134:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 135f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x26, #0, 136f\n" + "ld1 { v1.h }[2], [x25]\n" + "ld1 { v2.h }[2], [x24]\n" + "ld1 { v3.h }[2], [x23]\n" + "ld1 { v4.h }[2], [x22]\n" + "b 136f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "ldr h2, [x24, #0x0]\n" + "ldr h3, [x23, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "136:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "137:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 127b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 138f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, 
v1.4s\n" + "138:" // Height 4: No activation + "uzp1 v7.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "bge 147f\n" + "tbz x11, #3, 142f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x11, #2, 140f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x11, #1, 139f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 146f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 141f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 146f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 144f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x11, #1, 143f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 146f\n" + "143:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 146f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 145f\n" + "str d7, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 146f\n" + "145:" // Height 4: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, 
[x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "148:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 113b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "150:" // Height 5: Column loop + "cbz x9, 151f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "mov v24.16b, v8.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v15.16b\n" + "b 163f\n" + "151:" // Height 5: no bias + "tbz %x[flags], #0, 162f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 160f\n" + "tbz x11, #3, 155f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #2, 153f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "tbz x11, #1, 152f\n" + "ldr d16, [x28], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d6, [x21], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v6.s }[2], [x21]\n" + "b 159f\n" + "152:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 159f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s6, [x21, #0x0]\n" + "b 159f\n" + "153:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 154f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "b 159f\n" + "154:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 159f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, 
#0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "b 159f\n" + "155:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 157f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #1, 156f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "b 159f\n" + "156:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 159f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "b 159f\n" + "157:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 158f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "b 159f\n" + "158:" // Height 5: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "159:" // Height 5: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 161f\n" + "160:" // Height 5: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q25, [x21, #0x0]\n" + "ldr q26, [x21, #0x10]\n" + "ldr q27, [x21, #0x20]\n" + "ldr q6, [x21, #0x30]\n" + "161:" // Height 5: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 163f\n" + "162:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, 
#0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "163:" // Height 5: setup done + "mov x27, #0x0\n" + "164:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 166f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 166f\n" + "165:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "166:" // Height 5: input setup done + "cmp x26, #0x8\n" + "blt 169f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x10\n" + "blt 168f\n" + "167:" // Height 5: Multiply loop: Main loop head + "movi v6.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x21, x21, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x8\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + "cmp x26, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, 
#0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "bge 167b\n" + "168:" // Height 5: Multiply loop: Single iteration only + "movi v6.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // 
bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "169:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 174f\n" + "cmp x26, #0x4\n" + "blt 171f\n" + "170:" // Height 5: Multiply loop: Odd block loop + "movi v7.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x4\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" 
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 170b\n" + "cbz x26, 174f\n" + "171:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #1, 172f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "tbz x26, #0, 173f\n" + "ld1 { v1.h }[2], [x25]\n" + "ld1 { v2.h }[2], [x24]\n" + "ld1 { v3.h }[2], [x23]\n" + "ld1 { v4.h }[2], [x22]\n" + "ld1 { v5.h }[2], [x21]\n" + "b 173f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "ldr h2, [x24, #0x0]\n" + "ldr h3, [x23, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "ldr h5, [x21, #0x0]\n" + "173:" // Height 5: Multiply loop: Ragged operand read: Done + "movi v6.16b, #0x0\n" + "ldr q7, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "174:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 164b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 175f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + 
"fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "175:" // Height 5: No activation + "uzp1 v7.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "bge 184f\n" + "tbz x11, #3, 179f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 177f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 176f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 183f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 178f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "b 183f\n" + "178:" // Height 5: Partial 
direct writeback: partial_1_8 + "tbz x11, #0, 183f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 181f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 180f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 183f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 182f\n" + "str d7, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "185:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 150b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "187:" // Height 6: Column loop + "cbz x9, 188f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "mov v24.16b, v8.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov 
v31.16b, v15.16b\n" + "b 200f\n" + "188:" // Height 6: no bias + "tbz %x[flags], #0, 199f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 197f\n" + "tbz x11, #3, 192f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 190f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 189f\n" + "ldr d16, [x28], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d6, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v6.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 196f\n" + "189:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 196f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s6, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 196f\n" + "190:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 191f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 196f\n" + "191:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 196f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 196f\n" + "192:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 194f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 193f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 196f\n" + "193:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 196f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "ldr s29, [x20, #0x0]\n" + "b 196f\n" + "194:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 195f\n" + "ldr d9, [x28], 
#0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 196f\n" + "195:" // Height 6: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + "196:" // Height 6: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 198f\n" + "197:" // Height 6: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q25, [x21, #0x0]\n" + "ldr q26, [x21, #0x10]\n" + "ldr q27, [x21, #0x20]\n" + "ldr q6, [x21, #0x30]\n" + "ldr q28, [x20, #0x0]\n" + "ldr q29, [x20, #0x10]\n" + "ldr q30, [x20, #0x20]\n" + "ldr q31, [x20, #0x30]\n" + "198:" // Height 6: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 200f\n" + "199:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "200:" // Height 6: setup done + "mov x27, #0x0\n" + "201:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 202f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 203f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add 
x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 203f\n" + "202:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "203:" // Height 6: input setup done + "cmp x26, #0x8\n" + "blt 206f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "blt 205f\n" + "204:" // Height 6: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x8\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 
0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "bge 204b\n" + "205:" // Height 6: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x8\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x10, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla 
v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "206:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 211f\n" + "cmp x26, #0x4\n" + "blt 208f\n" + "207:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x4\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x4\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 207b\n" + "cbz x26, 211f\n" + "208:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #1, 209f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "ldr s6, [x20], #0x4\n" + "tbz x26, #0, 210f\n" + "ld1 { v1.h }[2], [x25]\n" + "ld1 { v2.h }[2], [x24]\n" + "ld1 { v3.h }[2], [x23]\n" + "ld1 { v4.h }[2], [x22]\n" + "ld1 { v5.h }[2], [x21]\n" + "ld1 { v6.h }[2], [x20]\n" + "b 210f\n" + "209:" // 
Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x25, #0x0]\n" + "ldr h2, [x24, #0x0]\n" + "ldr h3, [x23, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "ldr h5, [x21, #0x0]\n" + "ldr h6, [x20, #0x0]\n" + "210:" // Height 6: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x10, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "211:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 201b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 212f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + 
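+ // Editorial note, not part of the upstream patch: the same min/max clamp
+ // runs over all 24 accumulators (v8-v31) in the six-row case. Each BFMMLA
+ // result is a 2x2 fp32 tile spanning two output rows, so the accumulators
+ // are still row-interleaved at this point; the uzp1/uzp2 sequence after
+ // label 212 de-interleaves them into per-row vectors ahead of the writeback.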
"fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "212:" // Height 6: No activation + "uzp1 v7.2d, v8.2d, v12.2d\n" + "cmp x11, #0x10\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "bge 221f\n" + "tbz x11, #3, 216f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x11, #2, 214f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x11, #1, 213f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 220f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 215f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v29.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 220f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, 
[x22, #0x0]\n" + "str s29, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 218f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x11, #1, 217f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v28.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 220f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s28, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 220f\n" + "218:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 219f\n" + "str d7, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q23, [x21, #0x0]\n" + "str q28, [x21, #0x10]\n" + "str q29, [x21, #0x20]\n" + "str q30, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "222:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 187b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 224f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 223f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), 
[offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp index 674d71d626..4dd7556acd 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. */ #pragma once -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -44,7 +44,8 @@ void a64_hybrid_fp16_mla_6x32_a55( ARGLIST ); class cls_a64_hybrid_fp16_mla_6x32 { public: - typedef __fp16 operand_type; + typedef __fp16 lhs_operand_type; + typedef __fp16 rhs_operand_type; typedef __fp16 result_type; typedef void (*kern_type)( ARGLIST ); @@ -70,16 +71,24 @@ public: return true; } - StdTransformsFixed transforms = {}; - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + StdTransformsFixed transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 6.94 }; - default: - return { 14.53 }; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 5.22 }; + default: + return { 14.53 }; + case CPUModel::A510: + return { 8.94 }; + case CPUModel::V1: + return { 29.26 }; + } } + + return { 1.0 }; } // Default to the generic kernel @@ -99,4 +108,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp index 87c73740e7..9157d29eba 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp @@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32_a55 ( break; } __asm__ __volatile__( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ".arch armv8.2-a+fp16\n" -#endif "1:" // Row loop "cmp %x[M], #0x6\n" "bge 246f\n" @@ -1305,14 +1302,14 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" - "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" - "mov v14.16b, v10.16b\n" - "mov v18.16b, v10.16b\n" "ldr q11, [x16, #0x30]\n" + "mov v13.16b, v9.16b\n" "add x16, x16, #0x40\n" + "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" "b 120f\n" "101:" // Height 3: no bias @@ -2158,18 +2155,18 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, 
v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" "mov v23.16b, v11.16b\n" "b 169f\n" "150:" // Height 4: no bias @@ -3182,22 +3179,22 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" "mov v24.16b, v8.16b\n" "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v11.16b\n" "mov v27.16b, v11.16b\n" "b 218f\n" "199:" // Height 5: no bias @@ -4380,26 +4377,26 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" "mov v24.16b, v8.16b\n" "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" "mov v28.16b, v8.16b\n" "mov v29.16b, v9.16b\n" "mov v30.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v11.16b\n" - "mov v27.16b, v11.16b\n" "mov v31.16b, v11.16b\n" "b 267f\n" "248:" // Height 6: no bias diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp index 6e51773166..8877306f40 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp @@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32 ( break; } __asm__ __volatile__( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ".arch armv8.2-a+fp16\n" -#endif "1:" // Row loop "cmp %x[M], #0x6\n" "bge 246f\n" @@ -4068,12 +4065,12 @@ void a64_hybrid_fp16_mla_6x32 ( "ld1 { v16.8h }, [x23], #0x10\n" "ld1 { v20.8h }, [x22], #0x10\n" "ld1 { v24.8h }, [x21], #0x10\n" - "ld1 { v28.8h }, [x20], #0x10\n" "ld1 { v9.8h }, [x28], #0x10\n" "ld1 { v13.8h }, [x24], #0x10\n" "ld1 { v17.8h }, [x23], #0x10\n" "ld1 { v21.8h }, [x22], #0x10\n" "ld1 { v25.8h }, [x21], #0x10\n" + "ld1 { v28.8h }, [x20], #0x10\n" "ld1 { v29.8h }, [x20], #0x10\n" "tbz x11, #3, 252f\n" "ld1 { v10.8h }, [x28], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp new file mode 100644 index 0000000000..d68e4a22b5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hybrid_fp32_mla_4x24( ARGLIST ); +void a64_hybrid_fp32_mla_4x24_a55( ARGLIST ); + +class cls_a64_hybrid_fp32_mla_4x24 +{ +public: + typedef float lhs_operand_type; + typedef float rhs_operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 24; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 24, 1> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 2.985 }; + case CPUModel::A53: + return { 1.43 }; + case CPUModel::A73: + return { 2.56 }; + case CPUModel::A510: + return { 3.51 }; + case CPUModel::V1: + return { 14.38 }; + default: + return { 6.614 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp32_mla_4x24; + cls_a64_hybrid_fp32_mla_4x24(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + case CPUModel::A53: + kernel=a64_hybrid_fp32_mla_4x24_a55; + break; + } + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp new file mode 100644 index 0000000000..1fbc9232f0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp @@ -0,0 +1,2807 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_4x24_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 124f\n" + "cmp %x[M], #0x2\n" + "bgt 83f\n" + "beq 42f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "mov x13, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x14, 3f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "ldr q12, [x14, #0x40]\n" + "ldr q13, [x14, #0x50]\n" + "add x14, x14, #0x60\n" + "b 18f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 17f\n" + "cmp x16, #0x18\n" + "bge 16f\n" + "tbz x16, #4, 7f\n" + "ld1 { v8.4s }, 
[x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v11.4s }, [x13], #0x10\n" + "tbz x16, #2, 5f\n" + "ld1 { v12.4s }, [x13], #0x10\n" + "tbz x16, #1, 4f\n" + "mov x19, #0x58\n" + "ldr d13, [x13], #0x8\n" + "tbz x16, #0, 15f\n" + "ld1 { v13.s }[2], [x13]\n" + "b 15f\n" + "4:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x16, #0, 15f\n" + "ldr s13, [x13, #0x0]\n" + "b 15f\n" + "5:" // Height 1: Partial accumulate: partial_2_16 + "tbz x16, #1, 6f\n" + "ldr d12, [x13], #0x8\n" + "mov x19, #0x48\n" + "tbz x16, #0, 15f\n" + "ld1 { v12.s }[2], [x13]\n" + "b 15f\n" + "6:" // Height 1: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x16, #0, 15f\n" + "ldr s12, [x13, #0x0]\n" + "b 15f\n" + "7:" // Height 1: Partial accumulate: partial_8_0 + "tbz x16, #3, 11f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 9f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 8f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #0, 15f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 15f\n" + "8:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 15f\n" + "ldr s11, [x13, #0x0]\n" + "b 15f\n" + "9:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 10f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 15f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 15f\n" + "10:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 15f\n" + "ldr s10, [x13, #0x0]\n" + "b 15f\n" + "11:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 13f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 12f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #0, 15f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 15f\n" + "12:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 15f\n" + "ldr s9, [x13, #0x0]\n" + "b 15f\n" + "13:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 14f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 15f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 15f\n" + "14:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x13, #0x0]\n" + "mov x19, #0x0\n" + "15:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 18f\n" + "16:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x13, #0x40]\n" + "ldr q13, [x13, #0x50]\n" + "b 18f\n" + "17:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "18:" // Height 1: setup done + "mov x12, #0x0\n" + "19:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "21:" // Height 1: input setup done + "cmp x11, #0x4\n" + "blt 24f\n" + "ldr q0, [x10, #0x0]\n" + "ldr q4, [x15, #0x0]\n" + "cmp x11, #0x8\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr d5, [x15, #0x10]\n" + "ldr x9, 
[x15, #0x18]\n" + "add x10, x10, #0x10\n" + "ldr d6, [x15, #0x20]\n" + "sub x11, x11, #0x4\n" + "ldr x28, [x15, #0x28]\n" + "cmp x11, #0x8\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x30]\n" + "ldr x27, [x15, #0x38]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "ldr d4, [x15, #0x40]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "mov v7.d[1], x27\n" + "ldr x26, [x15, #0x48]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr d5, [x15, #0x50]\n" + "ldr x9, [x15, #0x58]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0x60]\n" + "ldr x28, [x15, #0x68]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x70]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "ldr x27, [x15, #0x78]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr d4, [x15, #0x80]\n" + "ldr x26, [x15, #0x88]\n" + "mov v7.d[1], x27\n" + "ldr d5, [x15, #0x90]\n" + "ldr x9, [x15, #0x98]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0xa0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "mov v5.d[1], x9\n" + "ldr x28, [x15, #0xa8]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr d7, [x15, #0xb0]\n" + "ldr x27, [x15, #0xb8]\n" + "mov v6.d[1], x28\n" + "ldr d4, [x15, #0xc0]\n" + "ldr x26, [x15, #0xc8]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "mov v7.d[1], x27\n" + "ldr d5, [x15, #0xd0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "mov v4.d[1], x26\n" + "ldr x9, [x15, #0xd8]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "ldr d6, [x15, #0xe0]\n" + "ldr x28, [x15, #0xe8]\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0xf0]\n" + "ldr x27, [x15, #0xf8]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "mov v6.d[1], x28\n" + "ldr d4, [x15, #0x100]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "mov v7.d[1], x27\n" + "ldr x26, [x15, #0x108]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr d5, [x15, #0x110]\n" + "ldr x9, [x15, #0x118]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0x120]\n" + "ldr x28, [x15, #0x128]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x130]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "mov v6.d[1], x28\n" + "ldr x27, [x15, #0x138]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr d4, [x15, #0x140]\n" + "ldr x26, [x15, #0x148]\n" + "mov v7.d[1], x27\n" + "ldr d5, [x15, #0x150]\n" + "ldr x9, [x15, #0x158]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0x160]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "mov v5.d[1], x9\n" + "ldr x28, [x15, #0x168]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "ldr d7, [x15, #0x170]\n" + "ldr x27, [x15, #0x178]\n" + "add x15, x15, #0x180\n" + "mov v6.d[1], x28\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr x25, [x10, #0x8]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "mov v7.d[1], x27\n" + "ldr d4, [x15, #0x0]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x10, #0x0]\n" + "ldr x26, [x15, #0x8]\n" + "mov v0.d[1], x25\n" + "mov v4.d[1], x26\n" + "bge 22b\n" + "23:" // Height 1: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x4\n" + "ldr q7, [x15, #0x30]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x80]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q5, [x15, #0x90]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla 
v11.4s, v5.4s, v0.s[1]\n" + "ldr q4, [x15, #0xc0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "ldr q5, [x15, #0xd0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "ldr q4, [x15, #0x100]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q5, [x15, #0x110]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "ldr q4, [x15, #0x140]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q5, [x15, #0x150]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x180\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "24:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 26f\n" + "25:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "sub x11, x11, #0x1\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "add x15, x15, #0x60\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "cbnz x11, 25b\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 19b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "27:" // Height 1: No activation + "cmp x16, #0x18\n" + "bge 40f\n" + "tbz x16, #4, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v11.4s }, [x13], #0x10\n" + "tbz x16, #2, 29f\n" + "st1 { v12.4s }, [x13], #0x10\n" + "tbz x16, #1, 28f\n" + "str d13, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v13.s }[2], [x13]\n" + "b 39f\n" + "28:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x16, #0, 39f\n" + "str s13, [x13, #0x0]\n" + "b 39f\n" + "29:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x16, #1, 30f\n" + "str d12, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v12.s }[2], [x13]\n" + "b 39f\n" + "30:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x16, #0, 39f\n" + "str s12, [x13, #0x0]\n" + "b 39f\n" + "31:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x16, #3, 35f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 33f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 32f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v11.s }[2], [x13]\n" + "b 39f\n" + "32:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 39f\n" + "str s11, [x13, #0x0]\n" + "b 39f\n" + "33:" // Height 1: Partial direct writeback: 
partial_2_8 + "tbz x16, #1, 34f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v10.s }[2], [x13]\n" + "b 39f\n" + "34:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 39f\n" + "str s10, [x13, #0x0]\n" + "b 39f\n" + "35:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 37f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 36f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v9.s }[2], [x13]\n" + "b 39f\n" + "36:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 39f\n" + "str s9, [x13, #0x0]\n" + "b 39f\n" + "37:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 38f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #0, 39f\n" + "st1 { v8.s }[2], [x13]\n" + "b 39f\n" + "38:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "39:" // Height 1: Partial direct writeback: Done + "b 41f\n" + "40:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x13, #0x40]\n" + "str q13, [x13, #0x50]\n" + "add x13, x13, #0x60\n" + "41:" // Height 1: Writeback done + "subs x16, x16, #0x18\n" + "bgt 2b\n" + "b 166f\n" + "42:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "43:" // Height 2: Column loop + "cbz x14, 44f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v14.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v15.16b, v9.16b\n" + "ldr q12, [x14, #0x40]\n" + "mov v16.16b, v10.16b\n" + "ldr q13, [x14, #0x50]\n" + "mov v17.16b, v11.16b\n" + "add x14, x14, #0x60\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "b 59f\n" + "44:" // Height 2: no bias + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x16, #0x18\n" + "add x23, x13, x19, LSL #2\n" + "bge 57f\n" + "tbz x16, #4, 48f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v11.4s }, [x13], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "tbz x16, #2, 46f\n" + "ld1 { v12.4s }, [x13], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "tbz x16, #1, 45f\n" + "mov x19, #0x58\n" + "ldr d13, [x13], #0x8\n" + "ldr d19, [x23], #0x8\n" + "tbz x16, #0, 56f\n" + "ld1 { v13.s }[2], [x13]\n" + "ld1 { v19.s }[2], [x23]\n" + "b 56f\n" + "45:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x16, #0, 56f\n" + "ldr s13, [x13, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "b 56f\n" + "46:" // Height 2: Partial accumulate: partial_2_16 + "tbz x16, #1, 47f\n" + "ldr d12, [x13], #0x8\n" + "ldr d18, [x23], #0x8\n" + "mov x19, #0x48\n" + "tbz x16, #0, 56f\n" + "ld1 { v12.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x23]\n" + "b 56f\n" + "47:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x16, #0, 56f\n" + "ldr s12, [x13, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "b 56f\n" + "48:" // Height 2: Partial accumulate: partial_8_0 + "tbz x16, #3, 52f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "tbz x16, #2, 50f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "tbz x16, #1, 49f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d17, 
[x23], #0x8\n" + "tbz x16, #0, 56f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v17.s }[2], [x23]\n" + "b 56f\n" + "49:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 56f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "b 56f\n" + "50:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 51f\n" + "ldr d10, [x13], #0x8\n" + "ldr d16, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 56f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x23]\n" + "b 56f\n" + "51:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 56f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "b 56f\n" + "52:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 54f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x16, #1, 53f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x16, #0, 56f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x23]\n" + "b 56f\n" + "53:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 56f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "b 56f\n" + "54:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 55f\n" + "ldr d8, [x13], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 56f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 56f\n" + "55:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x13, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x23, #0x0]\n" + "56:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 59f\n" + "57:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x13, #0x40]\n" + "ldr q13, [x13, #0x50]\n" + "ldr q14, [x23, #0x0]\n" + "ldr q15, [x23, #0x10]\n" + "ldr q16, [x23, #0x20]\n" + "ldr q17, [x23, #0x30]\n" + "ldr q18, [x23, #0x40]\n" + "ldr q19, [x23, #0x50]\n" + "b 59f\n" + "58:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "59:" // Height 2: setup done + "mov x12, #0x0\n" + "60:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x12, 62f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 62f\n" + "61:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x24, x10, x19, LSL #2\n" + "62:" // Height 2: input setup done + "cmp x11, #0x4\n" + "blt 65f\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x11, #0x8\n" + "ldr q4, [x15, #0x0]\n" + "blt 64f\n" + "63:" // Height 2: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr d5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr x9, [x15, #0x18]\n" + "ldr d6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "ldr x28, [x15, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x30]\n" + "ldr x27, [x15, #0x38]\n" + "sub x11, 
x11, #0x4\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr d4, [x15, #0x40]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "mov v7.d[1], x27\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "ldr x26, [x15, #0x48]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr d5, [x15, #0x50]\n" + "mov v4.d[1], x26\n" + "ldr x9, [x15, #0x58]\n" + "ldr d6, [x15, #0x60]\n" + "cmp x11, #0x8\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "ldr x28, [x15, #0x68]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "mov v5.d[1], x9\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr d7, [x15, #0x70]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr x27, [x15, #0x78]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr d4, [x15, #0x80]\n" + "ldr x26, [x15, #0x88]\n" + "mov v7.d[1], x27\n" + "ldr d5, [x15, #0x90]\n" + "ldr x9, [x15, #0x98]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "mov v4.d[1], x26\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr d6, [x15, #0xa0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "mov v5.d[1], x9\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr x28, [x15, #0xa8]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr d7, [x15, #0xb0]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "ldr x27, [x15, #0xb8]\n" + "mov v6.d[1], x28\n" + "ldr d4, [x15, #0xc0]\n" + "ldr x26, [x15, #0xc8]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "mov v7.d[1], x27\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr d5, [x15, #0xd0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "mov v4.d[1], x26\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "ldr x9, [x15, #0xd8]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "ldr d6, [x15, #0xe0]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr x28, [x15, #0xe8]\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0xf0]\n" + "ldr x27, [x15, #0xf8]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "mov v6.d[1], x28\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "ldr d4, [x15, #0x100]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "mov v7.d[1], x27\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr x26, [x15, #0x108]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr d5, [x15, #0x110]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "ldr x9, [x15, #0x118]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0x120]\n" + "ldr x28, [x15, #0x128]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "mov v5.d[1], x9\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr d7, [x15, #0x130]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "mov v6.d[1], x28\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "ldr x27, [x15, #0x138]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr d4, [x15, #0x140]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr x26, [x15, #0x148]\n" + "mov v7.d[1], x27\n" + "ldr d5, [x15, #0x150]\n" + "ldr x9, [x15, #0x158]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "mov v4.d[1], x26\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d6, [x15, #0x160]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "mov v5.d[1], x9\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "ldr x28, [x15, #0x168]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "ldr d7, [x15, #0x170]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "ldr x27, [x15, #0x178]\n" + "mov v6.d[1], x28\n" + "ldr x25, [x10, #0x8]\n" + "ldr x23, [x24, #0x8]\n" + "add x15, x15, #0x180\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "mov v7.d[1], x27\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "ldr d4, [x15, #0x0]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr x26, [x15, #0x8]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr d0, [x10, #0x0]\n" + "ldr d1, [x24, #0x0]\n" + "mov v4.d[1], x26\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x23\n" + "bge 63b\n" + "64:" // 
Height 2: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "sub x11, x11, #0x4\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x10, x10, #0x10\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x80]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr q4, [x15, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "ldr q5, [x15, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x15, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x15, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x15, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x15, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "add x15, x15, #0x180\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "65:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 67f\n" + "66:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "sub x11, x11, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x60\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "cbnz x11, 66b\n" + "67:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 60b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "add x23, 
x13, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 68f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "68:" // Height 2: No activation + "cmp x16, #0x18\n" + "bge 81f\n" + "tbz x16, #4, 72f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v11.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "tbz x16, #2, 70f\n" + "st1 { v12.4s }, [x13], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "tbz x16, #1, 69f\n" + "str d13, [x13], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v13.s }[2], [x13]\n" + "st1 { v19.s }[2], [x23]\n" + "b 80f\n" + "69:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x16, #0, 80f\n" + "str s13, [x13, #0x0]\n" + "str s19, [x23, #0x0]\n" + "b 80f\n" + "70:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x16, #1, 71f\n" + "str d12, [x13], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v12.s }[2], [x13]\n" + "st1 { v18.s }[2], [x23]\n" + "b 80f\n" + "71:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x16, #0, 80f\n" + "str s12, [x13, #0x0]\n" + "str s18, [x23, #0x0]\n" + "b 80f\n" + "72:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x16, #3, 76f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "tbz x16, #2, 74f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "tbz x16, #1, 73f\n" + "str d11, [x13], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v17.s }[2], [x23]\n" + "b 80f\n" + "73:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 80f\n" + "str s11, [x13, #0x0]\n" + "str s17, [x23, #0x0]\n" + "b 80f\n" + "74:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 75f\n" + "str d10, [x13], #0x8\n" + "str d16, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v16.s }[2], [x23]\n" + "b 80f\n" + "75:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 80f\n" + "str s10, [x13, #0x0]\n" + "str s16, [x23, #0x0]\n" + "b 80f\n" + "76:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 78f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "tbz x16, #1, 77f\n" + "str d9, [x13], #0x8\n" + "str d15, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v15.s }[2], [x23]\n" + "b 80f\n" + "77:" // Height 2: Partial direct 
writeback: partial_1_4 + "tbz x16, #0, 80f\n" + "str s9, [x13, #0x0]\n" + "str s15, [x23, #0x0]\n" + "b 80f\n" + "78:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 79f\n" + "str d8, [x13], #0x8\n" + "str d14, [x23], #0x8\n" + "tbz x16, #0, 80f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v14.s }[2], [x23]\n" + "b 80f\n" + "79:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s14, [x23, #0x0]\n" + "80:" // Height 2: Partial direct writeback: Done + "b 82f\n" + "81:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x13, #0x40]\n" + "str q13, [x13, #0x50]\n" + "add x13, x13, #0x60\n" + "str q14, [x23, #0x0]\n" + "str q15, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "str q17, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "82:" // Height 2: Writeback done + "subs x16, x16, #0x18\n" + "bgt 43b\n" + "b 166f\n" + "83:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "84:" // Height 3: Column loop + "cbz x14, 85f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v14.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v15.16b, v9.16b\n" + "ldr q12, [x14, #0x40]\n" + "mov v16.16b, v10.16b\n" + "ldr q13, [x14, #0x50]\n" + "mov v17.16b, v11.16b\n" + "add x14, x14, #0x60\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v12.16b\n" + "mov v25.16b, v13.16b\n" + "b 100f\n" + "85:" // Height 3: no bias + "tbz %x[flags], #0, 99f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x16, #0x18\n" + "add x23, x13, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 98f\n" + "tbz x16, #4, 89f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v11.4s }, [x13], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v23.4s }, [x22], #0x10\n" + "tbz x16, #2, 87f\n" + "ld1 { v12.4s }, [x13], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "tbz x16, #1, 86f\n" + "ldr d13, [x13], #0x8\n" + "mov x19, #0x58\n" + "ldr d19, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v13.s }[2], [x13]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v25.s }[2], [x22]\n" + "b 97f\n" + "86:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x16, #0, 97f\n" + "ldr s13, [x13, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s25, [x22, #0x0]\n" + "b 97f\n" + "87:" // Height 3: Partial accumulate: partial_2_16 + "tbz x16, #1, 88f\n" + "ldr d12, [x13], #0x8\n" + "ldr d18, [x23], #0x8\n" + "mov x19, #0x48\n" + "ldr d24, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v12.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "b 97f\n" + "88:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x16, #0, 97f\n" + "ldr s12, [x13, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "b 97f\n" + "89:" // Height 3: Partial accumulate: partial_8_0 + "tbz x16, #3, 93f\n" + "ld1 { v8.4s 
}, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "tbz x16, #2, 91f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "tbz x16, #1, 90f\n" + "ldr d11, [x13], #0x8\n" + "mov x19, #0x38\n" + "ldr d17, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "b 97f\n" + "90:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 97f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "b 97f\n" + "91:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 92f\n" + "ldr d10, [x13], #0x8\n" + "ldr d16, [x23], #0x8\n" + "mov x19, #0x28\n" + "ldr d22, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "b 97f\n" + "92:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 97f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "b 97f\n" + "93:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 95f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "tbz x16, #1, 94f\n" + "ldr d9, [x13], #0x8\n" + "mov x19, #0x18\n" + "ldr d15, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "b 97f\n" + "94:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 97f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "b 97f\n" + "95:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 96f\n" + "ldr d8, [x13], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x19, #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x16, #0, 97f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "b 97f\n" + "96:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x13, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "97:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 100f\n" + "98:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x13, #0x40]\n" + "ldr q13, [x13, #0x50]\n" + "ldr q14, [x23, #0x0]\n" + "ldr q15, [x23, #0x10]\n" + "ldr q16, [x23, #0x20]\n" + "ldr q17, [x23, #0x30]\n" + "ldr q18, [x23, #0x40]\n" + "ldr q19, [x23, #0x50]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x22, #0x40]\n" + "ldr q25, [x22, #0x50]\n" + "b 100f\n" + "99:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "100:" // Height 3: setup done + "mov x12, #0x0\n" + "101:" // Height 3: String loop + "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 102f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x12, 103f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 103f\n" + "102:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x24, x10, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "103:" // Height 3: input setup done + "cmp x11, #0x4\n" + "blt 106f\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x11, #0x8\n" + "ldr q2, [x22, #0x0]\n" + "ldr q4, [x15, #0x0]\n" + "blt 105f\n" + "104:" // Height 3: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr d5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr x9, [x15, #0x18]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr d6, [x15, #0x20]\n" + "ldr x28, [x15, #0x28]\n" + "add x10, x10, #0x10\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x30]\n" + "ldr x27, [x15, #0x38]\n" + "add x24, x24, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr d4, [x15, #0x40]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "mov v7.d[1], x27\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr x26, [x15, #0x48]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "ldr d5, [x15, #0x50]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "mov v4.d[1], x26\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr x9, [x15, #0x58]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "ldr d6, [x15, #0x60]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "mov v5.d[1], x9\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "ldr x28, [x15, #0x68]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr d7, [x15, #0x70]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr x27, [x15, #0x78]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "mov v6.d[1], x28\n" + "ldr d4, [x15, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "mov v7.d[1], x27\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "ldr x26, [x15, #0x88]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr d5, [x15, #0x90]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr x9, [x15, #0x98]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "mov v4.d[1], x26\n" + "ldr d6, [x15, #0xa0]\n" + "sub x11, x11, #0x4\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "mov v5.d[1], x9\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr x28, [x15, #0xa8]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "ldr d7, [x15, #0xb0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr x27, [x15, #0xb8]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "mov v6.d[1], x28\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "ldr d4, [x15, #0xc0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "mov v7.d[1], x27\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr x26, [x15, #0xc8]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "ldr d5, [x15, #0xd0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "ldr x9, [x15, #0xd8]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "mov v4.d[1], x26\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "ldr d6, [x15, #0xe0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "mov v5.d[1], x9\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr x28, [x15, #0xe8]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "ldr d7, [x15, #0xf0]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + 
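+ // Main-loop B loads are split: each 128-bit vector is assembled from an ldr d (low half) plus an ldr x / mov v.d[1] pair (high half), interleaved with the fmla work to hide load latency.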
"ldr x27, [x15, #0xf8]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "mov v6.d[1], x28\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "ldr d4, [x15, #0x100]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "mov v7.d[1], x27\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr x26, [x15, #0x108]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "ldr d5, [x15, #0x110]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr x9, [x15, #0x118]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "mov v4.d[1], x26\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "ldr d6, [x15, #0x120]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "mov v5.d[1], x9\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr x28, [x15, #0x128]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "ldr d7, [x15, #0x130]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "ldr x27, [x15, #0x138]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "mov v6.d[1], x28\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "ldr d4, [x15, #0x140]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "mov v7.d[1], x27\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr x26, [x15, #0x148]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "ldr d5, [x15, #0x150]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr x9, [x15, #0x158]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "mov v4.d[1], x26\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "ldr d6, [x15, #0x160]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "mov v5.d[1], x9\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "ldr x28, [x15, #0x168]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "ldr d7, [x15, #0x170]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "ldr x27, [x15, #0x178]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "mov v6.d[1], x28\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "ldr x25, [x10, #0x8]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "mov v7.d[1], x27\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x10, #0x0]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x24, #0x0]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "ldr d2, [x22, #0x0]\n" + "mov v0.d[1], x25\n" + "cmp x11, #0x8\n" + "mov v1.d[1], x23\n" + "add x15, x15, #0x180\n" + "mov v2.d[1], x21\n" + "ldr d4, [x15, #0x0]\n" + "ldr x26, [x15, #0x8]\n" + "mov v4.d[1], x26\n" + "bge 104b\n" + "105:" // Height 3: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x10, x10, #0x10\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "ldr q4, [x15, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, 
v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "ldr q4, [x15, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "ldr q5, [x15, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x15, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x15, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x15, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x15, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "add x15, x15, #0x180\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "106:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 108f\n" + "107:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "sub x11, x11, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x60\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "cbnz x11, 107b\n" + "108:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 101b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "add x23, x13, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 109f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + 
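+ // Activation clamp: v1 and v0 hold the broadcast minimum and maximum; every live accumulator is clamped below with fmin against v0 and above with fmax against v1.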
"ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "109:" // Height 3: No activation + "cmp x16, #0x18\n" + "bge 122f\n" + "tbz x16, #4, 113f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v11.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x22], #0x10\n" + "tbz x16, #2, 111f\n" + "st1 { v12.4s }, [x13], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "tbz x16, #1, 110f\n" + "str d13, [x13], #0x8\n" + "str d19, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v13.s }[2], [x13]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v25.s }[2], [x22]\n" + "b 121f\n" + "110:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x16, #0, 121f\n" + "str s13, [x13, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s25, [x22, #0x0]\n" + "b 121f\n" + "111:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x16, #1, 112f\n" + "str d12, [x13], #0x8\n" + "str d18, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v12.s }[2], [x13]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v24.s }[2], [x22]\n" + "b 121f\n" + "112:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x16, #0, 121f\n" + "str s12, [x13, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "b 121f\n" + "113:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x16, #3, 117f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "tbz x16, #2, 115f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "tbz x16, #1, 114f\n" + "str d11, [x13], #0x8\n" + "str d17, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "b 121f\n" + "114:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 121f\n" + "str s11, [x13, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + 
"b 121f\n" + "115:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 116f\n" + "str d10, [x13], #0x8\n" + "str d16, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "b 121f\n" + "116:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 121f\n" + "str s10, [x13, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "b 121f\n" + "117:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 119f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "tbz x16, #1, 118f\n" + "str d9, [x13], #0x8\n" + "str d15, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "b 121f\n" + "118:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 121f\n" + "str s9, [x13, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "b 121f\n" + "119:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 120f\n" + "str d8, [x13], #0x8\n" + "str d14, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz x16, #0, 121f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "b 121f\n" + "120:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "121:" // Height 3: Partial direct writeback: Done + "b 123f\n" + "122:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x13, #0x40]\n" + "str q13, [x13, #0x50]\n" + "add x13, x13, #0x60\n" + "str q14, [x23, #0x0]\n" + "str q15, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "str q17, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x22, #0x40]\n" + "str q25, [x22, #0x50]\n" + "123:" // Height 3: Writeback done + "subs x16, x16, #0x18\n" + "bgt 84b\n" + "b 166f\n" + "124:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x10\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "125:" // Height 4: Column loop + "cbz x14, 126f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v14.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v15.16b, v9.16b\n" + "ldr q12, [x14, #0x40]\n" + "mov v16.16b, v10.16b\n" + "ldr q13, [x14, #0x50]\n" + "mov v17.16b, v11.16b\n" + "add x14, x14, #0x60\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v12.16b\n" + "mov v25.16b, v13.16b\n" + "mov v26.16b, v8.16b\n" + "mov v27.16b, v9.16b\n" + "mov v28.16b, v10.16b\n" + "mov v29.16b, v11.16b\n" + "mov v30.16b, v12.16b\n" + "mov v31.16b, v13.16b\n" + "b 141f\n" + "126:" // Height 4: no bias + "tbz %x[flags], #0, 140f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x16, #0x18\n" + "add x23, x13, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 139f\n" + "tbz x16, #4, 130f\n" + "ld1 { v8.4s }, 
[x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v11.4s }, [x13], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v23.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 128f\n" + "ld1 { v12.4s }, [x13], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 127f\n" + "ldr d13, [x13], #0x8\n" + "mov x19, #0x58\n" + "ldr d19, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v13.s }[2], [x13]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v25.s }[2], [x22]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 138f\n" + "127:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x16, #0, 138f\n" + "ldr s13, [x13, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s25, [x22, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 138f\n" + "128:" // Height 4: Partial accumulate: partial_2_16 + "tbz x16, #1, 129f\n" + "ldr d12, [x13], #0x8\n" + "ldr d18, [x23], #0x8\n" + "mov x19, #0x48\n" + "ldr d24, [x22], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v12.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 138f\n" + "129:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x16, #0, 138f\n" + "ldr s12, [x13, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 138f\n" + "130:" // Height 4: Partial accumulate: partial_8_0 + "tbz x16, #3, 134f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v15.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "tbz x16, #2, 132f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 131f\n" + "ldr d11, [x13], #0x8\n" + "mov x19, #0x38\n" + "ldr d17, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 138f\n" + "131:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 138f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 133f\n" + "ldr d10, [x13], #0x8\n" + "ldr d16, [x23], #0x8\n" + "mov x19, #0x28\n" + "ldr d22, [x22], #0x8\n" + "ldr d28, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 138f\n" + "133:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 138f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 136f\n" + "ld1 { v8.4s }, 
[x13], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "tbz x16, #1, 135f\n" + "ldr d9, [x13], #0x8\n" + "mov x19, #0x18\n" + "ldr d15, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "b 138f\n" + "135:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 138f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 137f\n" + "ldr d8, [x13], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x19, #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x16, #0, 138f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "b 138f\n" + "137:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x13, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "138:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 141f\n" + "139:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x13, #0x40]\n" + "ldr q13, [x13, #0x50]\n" + "ldr q14, [x23, #0x0]\n" + "ldr q15, [x23, #0x10]\n" + "ldr q16, [x23, #0x20]\n" + "ldr q17, [x23, #0x30]\n" + "ldr q18, [x23, #0x40]\n" + "ldr q19, [x23, #0x50]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x22, #0x40]\n" + "ldr q25, [x22, #0x50]\n" + "ldr q26, [x21, #0x0]\n" + "ldr q27, [x21, #0x10]\n" + "ldr q28, [x21, #0x20]\n" + "ldr q29, [x21, #0x30]\n" + "ldr q30, [x21, #0x40]\n" + "ldr q31, [x21, #0x50]\n" + "b 141f\n" + "140:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "141:" // Height 4: setup done + "mov x12, #0x0\n" + "142:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 143f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x12, 144f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 144f\n" + "143:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x24, x10, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "144:" // Height 4: input setup done + "cmp x11, #0x4\n" + "blt 147f\n" + "ldr q0, 
[x10, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x11, #0x8\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x15, #0x0]\n" + "blt 146f\n" + "145:" // Height 4: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr d5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr x9, [x15, #0x18]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr d6, [x15, #0x20]\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr x28, [x15, #0x28]\n" + "mov v5.d[1], x9\n" + "ldr d7, [x15, #0x30]\n" + "ldr x27, [x15, #0x38]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr d4, [x15, #0x40]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "mov v7.d[1], x27\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr x26, [x15, #0x48]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "ldr d5, [x15, #0x50]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "mov v4.d[1], x26\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "ldr x9, [x15, #0x58]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr d6, [x15, #0x60]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr x28, [x15, #0x68]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "mov v5.d[1], x9\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "ldr d7, [x15, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "mov v6.d[1], x28\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "ldr x27, [x15, #0x78]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "ldr x26, [x15, #0x88]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "ldr d4, [x15, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "mov v7.d[1], x27\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr x9, [x15, #0x98]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "mov v4.d[1], x26\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "ldr d5, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr x28, [x15, #0xa8]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr x27, [x15, #0xb8]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "mov v5.d[1], x9\n" + "fmla v26.4s, v6.4s, v3.s[1]\n" + "ldr d6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr x26, [x15, #0xc8]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr x9, [x15, #0xd8]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x28\n" + "fmla v27.4s, v7.4s, v3.s[1]\n" + "ldr d7, [x15, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "ldr x28, [x15, #0xe8]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr x25, [x10, #0x8]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "mov v7.d[1], x27\n" + "fmla v28.4s, v4.4s, v3.s[1]\n" + "ldr d4, [x15, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr x27, [x15, #0xf8]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "add x24, x24, #0x10\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "mov v4.d[1], x26\n" + "fmla v29.4s, v5.4s, v3.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "ldr d5, [x15, #0xd0]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr x26, [x15, #0x108]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v30.4s, v6.4s, v3.s[1]\n" + "mov v5.d[1], x9\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "ldr d6, [x15, #0xe0]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "ldr x9, [x15, #0x118]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "add x22, x22, #0x10\n" + "fmla v31.4s, v7.4s, v3.s[1]\n" + "mov v6.d[1], x28\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr d7, [x15, #0xf0]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "ldr x28, [x15, #0x128]\n" + "fmla v26.4s, v4.4s, v3.s[2]\n" + "ldr d4, [x15, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "mov v7.d[1], x27\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + 
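+ // Height 4 uses the full register budget: v8-v31 hold a 4x24 tile of FP32 accumulators, with one input row streamed from each of x10, x24, x22 and x20.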
"ldr x27, [x15, #0x138]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "mov v4.d[1], x26\n" + "fmla v27.4s, v5.4s, v3.s[2]\n" + "ldr d5, [x15, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr x26, [x15, #0x148]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "mov v5.d[1], x9\n" + "fmla v28.4s, v6.4s, v3.s[2]\n" + "ldr d6, [x15, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr x9, [x15, #0x158]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "add x20, x20, #0x10\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x28\n" + "fmla v29.4s, v7.4s, v3.s[2]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "ldr d7, [x15, #0x130]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr x28, [x15, #0x168]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "ldr x19, [x20, #0x8]\n" + "fmla v30.4s, v4.4s, v3.s[2]\n" + "mov v7.d[1], x27\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "ldr d4, [x15, #0x140]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "ldr x27, [x15, #0x178]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "sub x11, x11, #0x4\n" + "fmla v31.4s, v5.4s, v3.s[2]\n" + "mov v4.d[1], x26\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr d5, [x15, #0x150]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "cmp x11, #0x8\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "fmla v26.4s, v6.4s, v3.s[3]\n" + "mov v5.d[1], x9\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr d6, [x15, #0x160]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "fmla v27.4s, v7.4s, v3.s[3]\n" + "mov v6.d[1], x28\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "ldr d7, [x15, #0x170]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "add x15, x15, #0x180\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "ldr x26, [x15, #0x8]\n" + "fmla v28.4s, v4.4s, v3.s[3]\n" + "mov v7.d[1], x27\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "ldr d4, [x15, #0x0]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v29.4s, v5.4s, v3.s[3]\n" + "mov v4.d[1], x26\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x10, #0x0]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x24, #0x0]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "ldr d2, [x22, #0x0]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x23\n" + "ldr d3, [x20, #0x0]\n" + "mov v2.d[1], x21\n" + "mov v3.d[1], x19\n" + "bge 145b\n" + "146:" // Height 4: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x15, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla 
v24.4s, v4.4s, v2.s[0]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x15, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "fmla v26.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "fmla v27.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "fmla v28.4s, v4.4s, v3.s[1]\n" + "ldr q4, [x15, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "fmla v29.4s, v5.4s, v3.s[1]\n" + "ldr q5, [x15, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "fmla v30.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "fmla v31.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "fmla v26.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x15, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "fmla v27.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x15, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "fmla v28.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "fmla v29.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "fmla v30.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x15, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "fmla v31.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x15, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "fmla v26.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "fmla v27.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "add x15, x15, #0x180\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v28.4s, v4.4s, v3.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v29.4s, v5.4s, v3.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "147:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 149f\n" + "148:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "sub x11, x11, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x20], #0x4\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "fmla v20.4s, v4.4s, 
v2.s[0]\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x15, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x15, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x60\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "cbnz x11, 148b\n" + "149:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 142b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "add x23, x13, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 150f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "150:" // Height 4: No activation + "cmp x16, #0x18\n" + "bge 163f\n" + "tbz x16, #4, 154f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v11.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x22], #0x10\n" + "st1 { 
v26.4s }, [x21], #0x10\n" + "st1 { v27.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 152f\n" + "st1 { v12.4s }, [x13], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 151f\n" + "str d13, [x13], #0x8\n" + "str d19, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v13.s }[2], [x13]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v25.s }[2], [x22]\n" + "st1 { v31.s }[2], [x21]\n" + "b 162f\n" + "151:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x16, #0, 162f\n" + "str s13, [x13, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s25, [x22, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 162f\n" + "152:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x16, #1, 153f\n" + "str d12, [x13], #0x8\n" + "str d18, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v12.s }[2], [x13]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v24.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "b 162f\n" + "153:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x16, #0, 162f\n" + "str s12, [x13, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 162f\n" + "154:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x16, #3, 158f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "st1 { v27.4s }, [x21], #0x10\n" + "tbz x16, #2, 156f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 155f\n" + "str d11, [x13], #0x8\n" + "str d17, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "st1 { v29.s }[2], [x21]\n" + "b 162f\n" + "155:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 162f\n" + "str s11, [x13, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 162f\n" + "156:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 157f\n" + "str d10, [x13], #0x8\n" + "str d16, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v28.s }[2], [x21]\n" + "b 162f\n" + "157:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 162f\n" + "str s10, [x13, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s28, [x21, #0x0]\n" + "b 162f\n" + "158:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 160f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "tbz x16, #1, 159f\n" + "str d9, [x13], #0x8\n" + "str d15, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "b 162f\n" + "159:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 162f\n" + "str s9, [x13, #0x0]\n" + "str s15, 
[x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "b 162f\n" + "160:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 161f\n" + "str d8, [x13], #0x8\n" + "str d14, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x16, #0, 162f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "b 162f\n" + "161:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "162:" // Height 4: Partial direct writeback: Done + "b 164f\n" + "163:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x13, #0x40]\n" + "str q13, [x13, #0x50]\n" + "add x13, x13, #0x60\n" + "str q14, [x23, #0x0]\n" + "str q15, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "str q17, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x22, #0x40]\n" + "str q25, [x22, #0x50]\n" + "str q26, [x21, #0x0]\n" + "str q27, [x21, #0x10]\n" + "str q28, [x21, #0x20]\n" + "str q29, [x21, #0x30]\n" + "str q30, [x21, #0x40]\n" + "str q31, [x21, #0x50]\n" + "164:" // Height 4: Writeback done + "subs x16, x16, #0x18\n" + "bgt 125b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 166f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 165f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "165:" // Update direct input + "mov x19, #0x10\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "166:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp new file mode 100644 index 0000000000..37d59cc327 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp @@ -0,0 +1,2595 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_4x24 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 124f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 83f\n"
+      "beq 42f\n"
+      "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x27, %x[bias]\n"
+      "mov x26, %x[output_ptr]\n"
+      "2:" // Height 1: Column loop
+      "cbz x27, 3f\n"
+      "ldr q8, [x27, #0x0]\n"
+      "ldr q9, [x27, #0x10]\n"
+      "ldr q10, [x27, #0x20]\n"
+      "ldr q11, [x27, #0x30]\n"
+      "ldr q12, [x27, #0x40]\n"
+      "ldr q13, [x27, #0x50]\n"
+      "add x27, x27, #0x60\n"
+      "b 18f\n"
+      "3:" // Height 1: no bias
+      "tbz %x[flags], #0, 17f\n"
+      "cmp x9, #0x18\n"
+      "bge 16f\n"
+      "tbz x9, #4, 7f\n"
+      "ld1 { v8.4s }, [x26],
#0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "tbz x9, #2, 5f\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "tbz x9, #1, 4f\n" + "mov x19, #0x58\n" + "ldr d13, [x26], #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v13.s }[2], [x26]\n" + "b 15f\n" + "4:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 15f\n" + "ldr s13, [x26, #0x0]\n" + "b 15f\n" + "5:" // Height 1: Partial accumulate: partial_2_16 + "tbz x9, #1, 6f\n" + "ldr d12, [x26], #0x8\n" + "mov x19, #0x48\n" + "tbz x9, #0, 15f\n" + "ld1 { v12.s }[2], [x26]\n" + "b 15f\n" + "6:" // Height 1: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 15f\n" + "ldr s12, [x26, #0x0]\n" + "b 15f\n" + "7:" // Height 1: Partial accumulate: partial_8_0 + "tbz x9, #3, 11f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "tbz x9, #2, 9f\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "tbz x9, #1, 8f\n" + "mov x19, #0x38\n" + "ldr d11, [x26], #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v11.s }[2], [x26]\n" + "b 15f\n" + "8:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 15f\n" + "ldr s11, [x26, #0x0]\n" + "b 15f\n" + "9:" // Height 1: Partial accumulate: partial_2_8 + "tbz x9, #1, 10f\n" + "ldr d10, [x26], #0x8\n" + "mov x19, #0x28\n" + "tbz x9, #0, 15f\n" + "ld1 { v10.s }[2], [x26]\n" + "b 15f\n" + "10:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 15f\n" + "ldr s10, [x26, #0x0]\n" + "b 15f\n" + "11:" // Height 1: Partial accumulate: partial_4_0 + "tbz x9, #2, 13f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "tbz x9, #1, 12f\n" + "ldr d9, [x26], #0x8\n" + "mov x19, #0x18\n" + "tbz x9, #0, 15f\n" + "ld1 { v9.s }[2], [x26]\n" + "b 15f\n" + "12:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 15f\n" + "ldr s9, [x26, #0x0]\n" + "b 15f\n" + "13:" // Height 1: Partial accumulate: partial_2_0 + "tbz x9, #1, 14f\n" + "ldr d8, [x26], #0x8\n" + "mov x19, #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v8.s }[2], [x26]\n" + "b 15f\n" + "14:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x26, #0x0]\n" + "mov x19, #0x0\n" + "15:" // Height 1: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 18f\n" + "16:" // Height 1: full accumulate + "ldr q8, [x26, #0x0]\n" + "ldr q9, [x26, #0x10]\n" + "ldr q10, [x26, #0x20]\n" + "ldr q11, [x26, #0x30]\n" + "ldr q12, [x26, #0x40]\n" + "ldr q13, [x26, #0x50]\n" + "b 18f\n" + "17:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "18:" // Height 1: setup done + "mov x25, #0x0\n" + "19:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "21:" // Height 1: input setup done + "cmp x24, #0x4\n" + "blt 24f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "cmp x24, #0x8\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "fmla v9.4s, 
v5.4s, v0.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "sub x24, x24, #0x4\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "cmp x24, #0x8\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q5, [x28, #0x90]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "ldr q7, [x28, #0xf0]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "ldr q4, [x28, #0x0]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x23, #0x0]\n" + "bge 22b\n" + "23:" // Height 1: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "ldr q4, [x28, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "24:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 26f\n" + "25:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "sub x24, x24, #0x1\n" + "ldr q4, [x28, #0x0]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q4, [x28, #0x40]\n" + 
"ldr q5, [x28, #0x50]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x60\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "cbnz x24, 25b\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 19b\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "27:" // Height 1: No activation + "cmp x9, #0x18\n" + "bge 40f\n" + "tbz x9, #4, 31f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x26], #0x10\n" + "tbz x9, #2, 29f\n" + "st1 { v12.4s }, [x26], #0x10\n" + "tbz x9, #1, 28f\n" + "str d13, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v13.s }[2], [x26]\n" + "b 39f\n" + "28:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x9, #0, 39f\n" + "str s13, [x26, #0x0]\n" + "b 39f\n" + "29:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x9, #1, 30f\n" + "str d12, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v12.s }[2], [x26]\n" + "b 39f\n" + "30:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x9, #0, 39f\n" + "str s12, [x26, #0x0]\n" + "b 39f\n" + "31:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x9, #3, 35f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "tbz x9, #2, 33f\n" + "st1 { v10.4s }, [x26], #0x10\n" + "tbz x9, #1, 32f\n" + "str d11, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v11.s }[2], [x26]\n" + "b 39f\n" + "32:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 39f\n" + "str s11, [x26, #0x0]\n" + "b 39f\n" + "33:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 34f\n" + "str d10, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v10.s }[2], [x26]\n" + "b 39f\n" + "34:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 39f\n" + "str s10, [x26, #0x0]\n" + "b 39f\n" + "35:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 37f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "tbz x9, #1, 36f\n" + "str d9, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v9.s }[2], [x26]\n" + "b 39f\n" + "36:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 39f\n" + "str s9, [x26, #0x0]\n" + "b 39f\n" + "37:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 38f\n" + "str d8, [x26], #0x8\n" + "tbz x9, #0, 39f\n" + "st1 { v8.s }[2], [x26]\n" + "b 39f\n" + "38:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x26, #0x0]\n" + "39:" // Height 1: Partial direct writeback: Done + "b 41f\n" + "40:" // Height 1: Full writeback + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q12, [x26, #0x40]\n" + "str q13, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "41:" // Height 1: Writeback done + "subs x9, x9, #0x18\n" + "bgt 2b\n" + "b 166f\n" + "42:" // Height 2 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr 
x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "43:" // Height 2: Column loop + "cbz x27, 44f\n" + "ldr q8, [x27, #0x0]\n" + "mov v14.16b, v8.16b\n" + "ldr q9, [x27, #0x10]\n" + "ldr q10, [x27, #0x20]\n" + "mov v15.16b, v9.16b\n" + "ldr q11, [x27, #0x30]\n" + "mov v16.16b, v10.16b\n" + "ldr q12, [x27, #0x40]\n" + "ldr q13, [x27, #0x50]\n" + "mov v17.16b, v11.16b\n" + "add x27, x27, #0x60\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "b 59f\n" + "44:" // Height 2: no bias + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "bge 57f\n" + "tbz x9, #4, 48f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x9, #2, 46f\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x9, #1, 45f\n" + "mov x19, #0x58\n" + "ldr d13, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x9, #0, 56f\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 56f\n" + "45:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 56f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 56f\n" + "46:" // Height 2: Partial accumulate: partial_2_16 + "tbz x9, #1, 47f\n" + "ldr d12, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "tbz x9, #0, 56f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 56f\n" + "47:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 56f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 56f\n" + "48:" // Height 2: Partial accumulate: partial_8_0 + "tbz x9, #3, 52f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "tbz x9, #2, 50f\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x9, #1, 49f\n" + "mov x19, #0x38\n" + "ldr d11, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x9, #0, 56f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 56f\n" + "49:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 56f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 56f\n" + "50:" // Height 2: Partial accumulate: partial_2_8 + "tbz x9, #1, 51f\n" + "ldr d10, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "tbz x9, #0, 56f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 56f\n" + "51:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 56f\n" + "ldr s10, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "b 56f\n" + "52:" // Height 2: Partial accumulate: partial_4_0 + "tbz x9, #2, 54f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "tbz x9, #1, 53f\n" + "mov x19, #0x18\n" + "ldr d9, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "tbz x9, #0, 56f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "b 56f\n" + "53:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 56f\n" + "ldr s9, [x26, #0x0]\n" + "ldr s15, [x22, #0x0]\n" + "b 56f\n" + "54:" // Height 2: Partial accumulate: partial_2_0 + "tbz x9, #1, 55f\n" + "ldr d8, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "tbz x9, #0, 56f\n" + 
"ld1 { v8.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "b 56f\n" + "55:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "56:" // Height 2: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 59f\n" + "57:" // Height 2: full accumulate + "ldr q8, [x26, #0x0]\n" + "ldr q9, [x26, #0x10]\n" + "ldr q10, [x26, #0x20]\n" + "ldr q11, [x26, #0x30]\n" + "ldr q12, [x26, #0x40]\n" + "ldr q13, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + "ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "b 59f\n" + "58:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "59:" // Height 2: setup done + "mov x25, #0x0\n" + "60:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 62f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 62f\n" + "61:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "62:" // Height 2: input setup done + "cmp x24, #0x4\n" + "blt 65f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x8\n" + "ldr q4, [x28, #0x0]\n" + "blt 64f\n" + "63:" // Height 2: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "sub x24, x24, #0x4\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "cmp x24, #0x8\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x28, #0x120]\n" + 
"fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "ldr q4, [x28, #0x0]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x23, #0x0]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x22, #0x0]\n" + "bge 63b\n" + "64:" // Height 2: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "65:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 67f\n" + "66:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "sub x24, x24, #0x1\n" + "ldr s1, [x22], #0x4\n" + 
"ldr q4, [x28, #0x0]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "add x28, x28, #0x60\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "cbnz x24, 66b\n" + "67:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 60b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 68f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "68:" // Height 2: No activation + "cmp x9, #0x18\n" + "bge 81f\n" + "tbz x9, #4, 72f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x9, #2, 70f\n" + "st1 { v12.4s }, [x26], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x9, #1, 69f\n" + "str d13, [x26], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v13.s }[2], [x26]\n" + "st1 { v19.s }[2], [x22]\n" + "b 80f\n" + "69:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x9, #0, 80f\n" + "str s13, [x26, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 80f\n" + "70:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x9, #1, 71f\n" + "str d12, [x26], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v12.s }[2], [x26]\n" + "st1 { v18.s }[2], [x22]\n" + "b 80f\n" + "71:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x9, #0, 80f\n" + "str s12, [x26, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 80f\n" + "72:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x9, #3, 76f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "tbz x9, #2, 74f\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x9, #1, 73f\n" + "str d11, [x26], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v17.s }[2], [x22]\n" + "b 80f\n" + "73:" // Height 2: 
Partial direct writeback: partial_1_12 + "tbz x9, #0, 80f\n" + "str s11, [x26, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 80f\n" + "74:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 75f\n" + "str d10, [x26], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v16.s }[2], [x22]\n" + "b 80f\n" + "75:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 80f\n" + "str s10, [x26, #0x0]\n" + "str s16, [x22, #0x0]\n" + "b 80f\n" + "76:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 78f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "tbz x9, #1, 77f\n" + "str d9, [x26], #0x8\n" + "str d15, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v15.s }[2], [x22]\n" + "b 80f\n" + "77:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 80f\n" + "str s9, [x26, #0x0]\n" + "str s15, [x22, #0x0]\n" + "b 80f\n" + "78:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 79f\n" + "str d8, [x26], #0x8\n" + "str d14, [x22], #0x8\n" + "tbz x9, #0, 80f\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v14.s }[2], [x22]\n" + "b 80f\n" + "79:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x26, #0x0]\n" + "str s14, [x22, #0x0]\n" + "80:" // Height 2: Partial direct writeback: Done + "b 82f\n" + "81:" // Height 2: Full writeback + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q12, [x26, #0x40]\n" + "str q13, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q14, [x22, #0x0]\n" + "str q15, [x22, #0x10]\n" + "str q16, [x22, #0x20]\n" + "str q17, [x22, #0x30]\n" + "str q18, [x22, #0x40]\n" + "str q19, [x22, #0x50]\n" + "82:" // Height 2: Writeback done + "subs x9, x9, #0x18\n" + "bgt 43b\n" + "b 166f\n" + "83:" // Height 3 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "84:" // Height 3: Column loop + "cbz x27, 85f\n" + "ldr q8, [x27, #0x0]\n" + "mov v14.16b, v8.16b\n" + "ldr q9, [x27, #0x10]\n" + "mov v20.16b, v8.16b\n" + "ldr q10, [x27, #0x20]\n" + "ldr q11, [x27, #0x30]\n" + "mov v15.16b, v9.16b\n" + "ldr q12, [x27, #0x40]\n" + "mov v21.16b, v9.16b\n" + "ldr q13, [x27, #0x50]\n" + "add x27, x27, #0x60\n" + "mov v16.16b, v10.16b\n" + "mov v17.16b, v11.16b\n" + "mov v22.16b, v10.16b\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v12.16b\n" + "mov v25.16b, v13.16b\n" + "b 100f\n" + "85:" // Height 3: no bias + "tbz %x[flags], #0, 99f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 98f\n" + "tbz x9, #4, 89f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "tbz x9, #2, 87f\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "tbz x9, #1, 86f\n" + "mov x19, #0x58\n" + "ldr d13, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v13.s }[2], [x26]\n" + 
"ld1 { v19.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "b 97f\n" + "86:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 97f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "b 97f\n" + "87:" // Height 3: Partial accumulate: partial_2_16 + "tbz x9, #1, 88f\n" + "ldr d12, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "ldr d24, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "b 97f\n" + "88:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 97f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "b 97f\n" + "89:" // Height 3: Partial accumulate: partial_8_0 + "tbz x9, #3, 93f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x9, #2, 91f\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x9, #1, 90f\n" + "mov x19, #0x38\n" + "ldr d11, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 97f\n" + "90:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 97f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 97f\n" + "91:" // Height 3: Partial accumulate: partial_2_8 + "tbz x9, #1, 92f\n" + "ldr d10, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "ldr d22, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 97f\n" + "92:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 97f\n" + "ldr s10, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 97f\n" + "93:" // Height 3: Partial accumulate: partial_4_0 + "tbz x9, #2, 95f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x9, #1, 94f\n" + "mov x19, #0x18\n" + "ldr d9, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 97f\n" + "94:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 97f\n" + "ldr s9, [x26, #0x0]\n" + "ldr s15, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 97f\n" + "95:" // Height 3: Partial accumulate: partial_2_0 + "tbz x9, #1, 96f\n" + "ldr d8, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "ldr d20, [x21], #0x8\n" + "tbz x9, #0, 97f\n" + "ld1 { v8.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 97f\n" + "96:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "97:" // Height 3: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 100f\n" + "98:" // Height 3: full accumulate + "ldr q8, [x26, #0x0]\n" + "ldr q9, [x26, #0x10]\n" + "ldr q10, [x26, #0x20]\n" + "ldr q11, [x26, #0x30]\n" + "ldr q12, [x26, #0x40]\n" + "ldr q13, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + 
"ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x21, #0x40]\n" + "ldr q25, [x21, #0x50]\n" + "b 100f\n" + "99:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "100:" // Height 3: setup done + "mov x25, #0x0\n" + "101:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 102f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 103f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 103f\n" + "102:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "103:" // Height 3: input setup done + "cmp x24, #0x4\n" + "blt 106f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x8\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 105f\n" + "104:" // Height 3: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "sub x24, x24, #0x4\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x24, #0x8\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, 
v2.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "ldr q4, [x28, #0x0]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x23, #0x0]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x22, #0x0]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "ldr q2, [x21, #0x0]\n" + "bge 104b\n" + "105:" // Height 3: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + 
"fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "106:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 108f\n" + "107:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "sub x24, x24, #0x1\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q4, [x28, #0x0]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "ldr q4, [x28, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "add x28, x28, #0x60\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "cbnz x24, 107b\n" + "108:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 101b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 109f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, 
v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "109:" // Height 3: No activation + "cmp x9, #0x18\n" + "bge 122f\n" + "tbz x9, #4, 113f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "tbz x9, #2, 111f\n" + "st1 { v12.4s }, [x26], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x9, #1, 110f\n" + "str d13, [x26], #0x8\n" + "str d19, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v13.s }[2], [x26]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 121f\n" + "110:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x9, #0, 121f\n" + "str s13, [x26, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 121f\n" + "111:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x9, #1, 112f\n" + "str d12, [x26], #0x8\n" + "str d18, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v12.s }[2], [x26]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 121f\n" + "112:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x9, #0, 121f\n" + "str s12, [x26, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "b 121f\n" + "113:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x9, #3, 117f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x9, #2, 115f\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x9, #1, 114f\n" + "str d11, [x26], #0x8\n" + "str d17, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 121f\n" + "114:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x9, #0, 121f\n" + "str s11, [x26, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 121f\n" + "115:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 116f\n" + "str d10, [x26], #0x8\n" + "str d16, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 121f\n" + "116:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 121f\n" + "str s10, [x26, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 121f\n" + "117:" // Height 3: Partial direct writeback: partial_4_0 + "tbz 
x9, #2, 119f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x9, #1, 118f\n" + "str d9, [x26], #0x8\n" + "str d15, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 121f\n" + "118:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 121f\n" + "str s9, [x26, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 121f\n" + "119:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 120f\n" + "str d8, [x26], #0x8\n" + "str d14, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x9, #0, 121f\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v14.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 121f\n" + "120:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x26, #0x0]\n" + "str s14, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "121:" // Height 3: Partial direct writeback: Done + "b 123f\n" + "122:" // Height 3: Full writeback + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q12, [x26, #0x40]\n" + "str q13, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q14, [x22, #0x0]\n" + "str q15, [x22, #0x10]\n" + "str q16, [x22, #0x20]\n" + "str q17, [x22, #0x30]\n" + "str q18, [x22, #0x40]\n" + "str q19, [x22, #0x50]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x21, #0x40]\n" + "str q25, [x21, #0x50]\n" + "123:" // Height 3: Writeback done + "subs x9, x9, #0x18\n" + "bgt 84b\n" + "b 166f\n" + "124:" // Height 4 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x10\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "125:" // Height 4: Column loop + "cbz x27, 126f\n" + "ldr q8, [x27, #0x0]\n" + "mov v14.16b, v8.16b\n" + "ldr q9, [x27, #0x10]\n" + "mov v20.16b, v8.16b\n" + "ldr q10, [x27, #0x20]\n" + "mov v26.16b, v8.16b\n" + "ldr q11, [x27, #0x30]\n" + "ldr q12, [x27, #0x40]\n" + "mov v15.16b, v9.16b\n" + "ldr q13, [x27, #0x50]\n" + "add x27, x27, #0x60\n" + "mov v16.16b, v10.16b\n" + "mov v21.16b, v9.16b\n" + "mov v17.16b, v11.16b\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v13.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v12.16b\n" + "mov v25.16b, v13.16b\n" + "mov v27.16b, v9.16b\n" + "mov v28.16b, v10.16b\n" + "mov v29.16b, v11.16b\n" + "mov v30.16b, v12.16b\n" + "mov v31.16b, v13.16b\n" + "b 141f\n" + "126:" // Height 4: no bias + "tbz %x[flags], #0, 140f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 139f\n" + "tbz x9, #4, 130f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x9, #2, 
128f\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x9, #1, 127f\n" + "mov x19, #0x58\n" + "ldr d13, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 138f\n" + "127:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 138f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 138f\n" + "128:" // Height 4: Partial accumulate: partial_2_16 + "tbz x9, #1, 129f\n" + "ldr d12, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "ldr d24, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 138f\n" + "129:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 138f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 138f\n" + "130:" // Height 4: Partial accumulate: partial_8_0 + "tbz x9, #3, 134f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "tbz x9, #2, 132f\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x9, #1, 131f\n" + "mov x19, #0x38\n" + "ldr d11, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 138f\n" + "131:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 138f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s29, [x20, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial accumulate: partial_2_8 + "tbz x9, #1, 133f\n" + "ldr d10, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "ldr d22, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 138f\n" + "133:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 138f\n" + "ldr s10, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial accumulate: partial_4_0 + "tbz x9, #2, 136f\n" + "ld1 { v8.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x9, #1, 135f\n" + "mov x19, #0x18\n" + "ldr d9, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 138f\n" + "135:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 138f\n" + "ldr s9, [x26, #0x0]\n" + "ldr s15, [x22, 
#0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial accumulate: partial_2_0 + "tbz x9, #1, 137f\n" + "ldr d8, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x9, #0, 138f\n" + "ld1 { v8.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 138f\n" + "137:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "138:" // Height 4: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 141f\n" + "139:" // Height 4: full accumulate + "ldr q8, [x26, #0x0]\n" + "ldr q9, [x26, #0x10]\n" + "ldr q10, [x26, #0x20]\n" + "ldr q11, [x26, #0x30]\n" + "ldr q12, [x26, #0x40]\n" + "ldr q13, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + "ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x21, #0x40]\n" + "ldr q25, [x21, #0x50]\n" + "ldr q26, [x20, #0x0]\n" + "ldr q27, [x20, #0x10]\n" + "ldr q28, [x20, #0x20]\n" + "ldr q29, [x20, #0x30]\n" + "ldr q30, [x20, #0x40]\n" + "ldr q31, [x20, #0x50]\n" + "b 141f\n" + "140:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "141:" // Height 4: setup done + "mov x25, #0x0\n" + "142:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 143f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 144f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 144f\n" + "143:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "144:" // Height 4: input setup done + "cmp x24, #0x4\n" + "blt 147f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x8\n" + "ldr q2, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 146f\n" + "145:" // Height 4: Multiply loop: Main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "add x20, x20, #0x10\n" + "fmla v9.4s, v5.4s, 
v0.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x4\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x8\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "fmla v26.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "fmla v27.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "fmla v28.4s, v4.4s, v3.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "fmla v29.4s, v5.4s, v3.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "fmla v30.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "fmla v31.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "fmla v26.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "fmla v27.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "fmla v28.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "fmla v29.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "fmla v30.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "fmla v31.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "fmla v26.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "fmla v27.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v28.4s, v4.4s, v3.s[3]\n" + "ldr q4, [x28, #0x0]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + 
"fmla v29.4s, v5.4s, v3.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x23, #0x0]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x22, #0x0]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "ldr q2, [x21, #0x0]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "ldr q3, [x20, #0x0]\n" + "bge 145b\n" + "146:" // Height 4: Multiply loop: Single iteration only + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x28, #0x60]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x28, #0x70]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x28, #0x80]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x28, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v20.4s, v6.4s, v2.s[1]\n" + "fmla v26.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x28, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v2.s[1]\n" + "fmla v27.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x28, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v2.s[1]\n" + "fmla v28.4s, v4.4s, v3.s[1]\n" + "ldr q4, [x28, #0xc0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v2.s[1]\n" + "fmla v29.4s, v5.4s, v3.s[1]\n" + "ldr q5, [x28, #0xd0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v2.s[1]\n" + "fmla v30.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x28, #0xe0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v2.s[1]\n" + "fmla v31.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x28, #0xf0]\n" + "fmla v8.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v1.s[2]\n" + "fmla v20.4s, v4.4s, v2.s[2]\n" + "fmla v26.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x28, #0x100]\n" + "fmla v9.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v2.s[2]\n" + "fmla v27.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x28, #0x110]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v2.s[2]\n" + "fmla v28.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x28, #0x120]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v2.s[2]\n" + "fmla v29.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x28, #0x130]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v1.s[2]\n" + "fmla v24.4s, v4.4s, v2.s[2]\n" + "fmla 
v30.4s, v4.4s, v3.s[2]\n" + "ldr q4, [x28, #0x140]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v1.s[2]\n" + "fmla v25.4s, v5.4s, v2.s[2]\n" + "fmla v31.4s, v5.4s, v3.s[2]\n" + "ldr q5, [x28, #0x150]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v20.4s, v6.4s, v2.s[3]\n" + "fmla v26.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x28, #0x160]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v21.4s, v7.4s, v2.s[3]\n" + "fmla v27.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x28, #0x170]\n" + "add x28, x28, #0x180\n" + "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v28.4s, v4.4s, v3.s[3]\n" + "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v29.4s, v5.4s, v3.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v25.4s, v7.4s, v2.s[3]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "147:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 149f\n" + "148:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "sub x24, x24, #0x1\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s3, [x20], #0x4\n" + "ldr q4, [x28, #0x0]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr q5, [x28, #0x10]\n" + "fmla v14.4s, v4.4s, v1.s[0]\n" + "ldr q6, [x28, #0x20]\n" + "fmla v20.4s, v4.4s, v2.s[0]\n" + "ldr q7, [x28, #0x30]\n" + "fmla v26.4s, v4.4s, v3.s[0]\n" + "ldr q4, [x28, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v15.4s, v5.4s, v1.s[0]\n" + "fmla v21.4s, v5.4s, v2.s[0]\n" + "fmla v27.4s, v5.4s, v3.s[0]\n" + "ldr q5, [x28, #0x50]\n" + "add x28, x28, #0x60\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v6.4s, v2.s[0]\n" + "fmla v28.4s, v6.4s, v3.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v2.s[0]\n" + "fmla v29.4s, v7.4s, v3.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v30.4s, v4.4s, v3.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v31.4s, v5.4s, v3.s[0]\n" + "cbnz x24, 148b\n" + "149:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 142b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 150f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin 
v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "150:" // Height 4: No activation + "cmp x9, #0x18\n" + "bge 163f\n" + "tbz x9, #4, 154f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v27.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "st1 { v29.4s }, [x20], #0x10\n" + "tbz x9, #2, 152f\n" + "st1 { v12.4s }, [x26], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v30.4s }, [x20], #0x10\n" + "tbz x9, #1, 151f\n" + "str d13, [x26], #0x8\n" + "str d19, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "str d31, [x20], #0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v13.s }[2], [x26]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "st1 { v31.s }[2], [x20]\n" + "b 162f\n" + "151:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x9, #0, 162f\n" + "str s13, [x26, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "str s31, [x20, #0x0]\n" + "b 162f\n" + "152:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x9, #1, 153f\n" + "str d12, [x26], #0x8\n" + "str d18, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v12.s }[2], [x26]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "st1 { v30.s }[2], [x20]\n" + "b 162f\n" + "153:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x9, #0, 162f\n" + "str s12, [x26, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "str s30, [x20, #0x0]\n" + "b 162f\n" + "154:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x9, #3, 158f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v27.4s }, [x20], #0x10\n" + "tbz x9, #2, 156f\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "tbz x9, #1, 155f\n" + "str d11, [x26], #0x8\n" + "str d17, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d29, [x20], 
#0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v29.s }[2], [x20]\n" + "b 162f\n" + "155:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 162f\n" + "str s11, [x26, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s29, [x20, #0x0]\n" + "b 162f\n" + "156:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 157f\n" + "str d10, [x26], #0x8\n" + "str d16, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d28, [x20], #0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v28.s }[2], [x20]\n" + "b 162f\n" + "157:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 162f\n" + "str s10, [x26, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s28, [x20, #0x0]\n" + "b 162f\n" + "158:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 160f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x9, #1, 159f\n" + "str d9, [x26], #0x8\n" + "str d15, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 162f\n" + "159:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 162f\n" + "str s9, [x26, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 162f\n" + "160:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x9, #1, 161f\n" + "str d8, [x26], #0x8\n" + "str d14, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x9, #0, 162f\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v14.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 162f\n" + "161:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x26, #0x0]\n" + "str s14, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "162:" // Height 4: Partial direct writeback: Done + "b 164f\n" + "163:" // Height 4: Full writeback + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q12, [x26, #0x40]\n" + "str q13, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q14, [x22, #0x0]\n" + "str q15, [x22, #0x10]\n" + "str q16, [x22, #0x20]\n" + "str q17, [x22, #0x30]\n" + "str q18, [x22, #0x40]\n" + "str q19, [x22, #0x50]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x21, #0x40]\n" + "str q25, [x21, #0x50]\n" + "str q26, [x20, #0x0]\n" + "str q27, [x20, #0x10]\n" + "str q28, [x20, #0x20]\n" + "str q29, [x20, #0x30]\n" + "str q30, [x20, #0x40]\n" + "str q31, [x20, #0x50]\n" + "164:" // Height 4: Writeback done + "subs x9, x9, #0x18\n" + "bgt 125b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 166f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 165f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "165:" // Update direct input + "mov x19, #0x10\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "166:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" 
(offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp index 7f83e617c5..de94e72ab0 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef __aarch64__ +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -44,7 +44,8 @@ void a64_hybrid_fp32_mla_6x16_a55( ARGLIST ); class cls_a64_hybrid_fp32_mla_6x16 { public: - typedef float operand_type; + typedef float lhs_operand_type; + typedef float rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -70,20 +71,28 @@ public: return true; } - StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {}; - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 1> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 3.04 }; - case CPUModel::A53: - return { 1.43 }; - case CPUModel::A73: - return { 2.56 }; - default: - return { 6.667 }; + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 2.986 }; + case CPUModel::A53: + return { 1.43 }; + case CPUModel::A73: + return { 2.56 }; + default: + return { 6.667 }; + case CPUModel::A510: + return { 3.88 }; + case CPUModel::V1: + return { 13.72 }; + } } + + return { 1.0 }; } // Default to the generic kernel @@ -104,4 +113,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp index 184cfaf95c..e8b7db21bd 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp @@ -839,14 +839,14 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" - "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" - "mov v14.16b, v10.16b\n" - "mov v18.16b, v10.16b\n" "ldr q11, [x16, #0x30]\n" + "mov v13.16b, v9.16b\n" "add x16, x16, #0x40\n" + "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" "b 80f\n" "69:" // Height 3: no bias @@ -1364,18 +1364,18 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "ldr
q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" "mov v23.16b, v11.16b\n" "b 113f\n" "102:" // Height 4: no bias @@ -1996,22 +1996,22 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" "mov v24.16b, v8.16b\n" "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v11.16b\n" "mov v27.16b, v11.16b\n" "b 146f\n" "135:" // Height 5: no bias @@ -2738,26 +2738,26 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "ldr q9, [x16, #0x10]\n" "ldr q10, [x16, #0x20]\n" "mov v12.16b, v8.16b\n" - "mov v16.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" + "add x16, x16, #0x40\n" "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" "mov v20.16b, v8.16b\n" "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" "mov v24.16b, v8.16b\n" "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" "mov v28.16b, v8.16b\n" "mov v29.16b, v9.16b\n" "mov v30.16b, v10.16b\n" - "ldr q11, [x16, #0x30]\n" - "add x16, x16, #0x40\n" - "mov v15.16b, v11.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v11.16b\n" - "mov v27.16b, v11.16b\n" "mov v31.16b, v11.16b\n" "b 179f\n" "168:" // Height 6: no bias diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp index f5504b44d4..28e9be4cb7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp @@ -1893,8 +1893,8 @@ void a64_hybrid_fp32_mla_6x16 ( "ld1 { v22.4s }, [x22], #0x10\n" "ld1 { v26.4s }, [x21], #0x10\n" "tbz x11, #1, 136f\n" - "mov x19, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x19, #0x38\n" "ldr d15, [x24], #0x8\n" "ldr d19, [x23], #0x8\n" "ldr d23, [x22], #0x8\n" @@ -1947,8 +1947,8 @@ void a64_hybrid_fp32_mla_6x16 ( "ld1 { v20.4s }, [x22], #0x10\n" "ld1 { v24.4s }, [x21], #0x10\n" "tbz x11, #1, 140f\n" - "mov x19, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x19, #0x18\n" "ldr d13, [x24], #0x8\n" "ldr d17, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" @@ -2586,12 +2586,12 @@ void a64_hybrid_fp32_mla_6x16 ( "ld1 { v16.4s }, [x23], #0x10\n" "ld1 { v20.4s }, [x22], #0x10\n" "ld1 { v24.4s }, [x21], #0x10\n" - "ld1 { v28.4s }, [x20], #0x10\n" "ld1 { v9.4s }, [x28], #0x10\n" "ld1 { v13.4s }, [x24], #0x10\n" "ld1 { v17.4s }, [x23], #0x10\n" "ld1 { v21.4s }, 
[x22], #0x10\n" "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" "ld1 { v29.4s }, [x20], #0x10\n" "tbz x11, #2, 170f\n" "ld1 { v10.4s }, [x28], #0x10\n" @@ -2601,8 +2601,8 @@ "ld1 { v26.4s }, [x21], #0x10\n" "ld1 { v30.4s }, [x20], #0x10\n" "tbz x11, #1, 169f\n" - "mov x19, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x19, #0x38\n" "ldr d15, [x24], #0x8\n" "ldr d19, [x23], #0x8\n" "ldr d23, [x22], #0x8\n" @@ -2662,8 +2662,8 @@ "ld1 { v24.4s }, [x21], #0x10\n" "ld1 { v28.4s }, [x20], #0x10\n" "tbz x11, #1, 173f\n" - "mov x19, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x19, #0x18\n" "ldr d13, [x24], #0x8\n" "ldr d17, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp index 957754ad68..4fad58a83d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef __aarch64__ +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #define ARGLIST \ @@ -43,7 +43,8 @@ void a64_hybrid_fp32_mla_8x4_a55( ARGLIST ); class cls_a64_hybrid_fp32_mla_8x4 { public: - typedef float operand_type; + typedef float lhs_operand_type; + typedef float rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -69,7 +70,7 @@ public: return true; } - StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {}; + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 8, 4, 1> transforms = {}; // Default to the generic kernel kern_type kernel=a64_hybrid_fp32_mla_8x4; @@ -89,4 +90,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp new file mode 100644 index 0000000000..090dd5855e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../bfloat.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hybrid_fp32bf16fp32_mmla_4x24( ARGLIST ); + +class cls_a64_hybrid_fp32bf16fp32_mmla_4x24 +{ +public: + typedef float lhs_operand_type; + typedef bfloat16 rhs_operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 24; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 24, 4> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + default: + return { 18.9 }; + case CPUModel::A510: + return { 6.81 }; + case CPUModel::V1: + return { 28.40 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_4x24; + cls_a64_hybrid_fp32bf16fp32_mmla_4x24(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp new file mode 100644 index 0000000000..76c2688291 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp @@ -0,0 +1,2426 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void a64_hybrid_fp32bf16fp32_mmla_4x24 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 130f\n" + "cmp %x[M], #0x2\n" + "bgt 87f\n" + "beq 44f\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[bias]\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x27, 3f\n" + "ldr q8, [x27, #0x0]\n" + "zip2 v14.2d, v8.2d, v8.2d\n" + "ldr q9, [x27, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x27, #0x20]\n" + "ldr q11, [x27, #0x30]\n" + "zip2 v15.2d, v9.2d, v9.2d\n" + "ldr q12, [x27, #0x40]\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "ldr q13, [x27, #0x50]\n" + "add x27, x27, #0x60\n" + "zip2 v16.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v17.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "zip2 v18.2d, v12.2d, v12.2d\n" + "zip1 v12.2d, v12.2d, v12.2d\n" + "zip2 v19.2d, v13.2d, v13.2d\n" + "zip1 v13.2d, v13.2d, v13.2d\n" + "b 19f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 18f\n" + "cmp x9, #0x18\n" + "bge 16f\n" + "tbz x9, #4, 7f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "tbz x9, #2, 5f\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "tbz x9, #1, 4f\n" + "mov x19, #0x58\n" + "ldr d20, [x26], #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v20.s }[2], [x26]\n" + "b 15f\n" + "4:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 15f\n" + "ldr s20, [x26, #0x0]\n" + "b 15f\n" + "5:" // Height 1: Partial accumulate: partial_2_16 + "tbz x9, #1, 6f\n" + "ldr d13, [x26], #0x8\n" + "mov x19, #0x48\n" + "tbz x9, #0, 15f\n" + "ld1 { v13.s }[2], [x26]\n" + "b 15f\n" + "6:" // Height 1: Partial accumulate: partial_1_16 + "mov
x19, #0x40\n" + "tbz x9, #0, 15f\n" + "ldr s13, [x26, #0x0]\n" + "b 15f\n" + "7:" // Height 1: Partial accumulate: partial_8_0 + "tbz x9, #3, 11f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "tbz x9, #2, 9f\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "tbz x9, #1, 8f\n" + "mov x19, #0x38\n" + "ldr d12, [x26], #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v12.s }[2], [x26]\n" + "b 15f\n" + "8:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 15f\n" + "ldr s12, [x26, #0x0]\n" + "b 15f\n" + "9:" // Height 1: Partial accumulate: partial_2_8 + "tbz x9, #1, 10f\n" + "ldr d11, [x26], #0x8\n" + "mov x19, #0x28\n" + "tbz x9, #0, 15f\n" + "ld1 { v11.s }[2], [x26]\n" + "b 15f\n" + "10:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 15f\n" + "ldr s11, [x26, #0x0]\n" + "b 15f\n" + "11:" // Height 1: Partial accumulate: partial_4_0 + "tbz x9, #2, 13f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "tbz x9, #1, 12f\n" + "ldr d10, [x26], #0x8\n" + "mov x19, #0x18\n" + "tbz x9, #0, 15f\n" + "ld1 { v10.s }[2], [x26]\n" + "b 15f\n" + "12:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 15f\n" + "ldr s10, [x26, #0x0]\n" + "b 15f\n" + "13:" // Height 1: Partial accumulate: partial_2_0 + "tbz x9, #1, 14f\n" + "ldr d9, [x26], #0x8\n" + "mov x19, #0x8\n" + "tbz x9, #0, 15f\n" + "ld1 { v9.s }[2], [x26]\n" + "b 15f\n" + "14:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x26, #0x0]\n" + "mov x19, #0x0\n" + "15:" // Height 1: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 17f\n" + "16:" // Height 1: full accumulate + "ldr q9, [x26, #0x0]\n" + "ldr q10, [x26, #0x10]\n" + "ldr q11, [x26, #0x20]\n" + "ldr q12, [x26, #0x30]\n" + "ldr q13, [x26, #0x40]\n" + "ldr q20, [x26, #0x50]\n" + "17:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v14.2d\n" + "zip2 v14.2d, v9.2d, v14.2d\n" + "zip1 v9.2d, v10.2d, v15.2d\n" + "zip2 v15.2d, v10.2d, v15.2d\n" + "zip1 v10.2d, v11.2d, v16.2d\n" + "zip2 v16.2d, v11.2d, v16.2d\n" + "zip1 v11.2d, v12.2d, v17.2d\n" + "zip2 v17.2d, v12.2d, v17.2d\n" + "zip1 v12.2d, v13.2d, v18.2d\n" + "zip2 v18.2d, v13.2d, v18.2d\n" + "zip1 v13.2d, v20.2d, v19.2d\n" + "zip2 v19.2d, v20.2d, v19.2d\n" + "b 19f\n" + "18:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "19:" // Height 1: setup done + "mov x25, #0x0\n" + "20:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 21f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 22f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "b 22f\n" + "21:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "22:" // Height 1: input setup done + "cmp x24, #0x4\n" + "blt 25f\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "cmp x24, #0x8\n" + "blt 24f\n" + "23:" // Height 1: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + "sub x24, x24, #0x4\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q5, [x28, #0x10]\n" + "cmp x24, #0x8\n" + ".inst 
0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ldr q5, [x28, #0x50]\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q7, [x28, #0x70]\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q5, [x28, #0x90]\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "bge 23b\n" + "24:" // Height 1: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + "sub x24, x24, #0x4\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q5, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q4, [x28, #0x40]\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ldr q6, [x28, #0x60]\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + "ldr q4, [x28, #0x80]\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "25:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 28f\n" + "cbz x24, 28f\n" + "tbz x24, #1, 26f\n" + "ldr d0, [x23], #0x8\n" + "tbz x24, #0, 27f\n" + "ld1 { v0.s }[2], [x23]\n" + "b 27f\n" + "26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x23, #0x0]\n" + "27:" // Height 1: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q7, [x28, #0x30]\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q5, [x28, #0x50]\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q7, [x28, #0x70]\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "28:" // Height 1: Multiply 
loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 20b\n" + "uzp1 v8.2d, v8.2d, v14.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "uzp1 v9.2d, v9.2d, v15.2d\n" + "uzp1 v10.2d, v10.2d, v16.2d\n" + "uzp1 v11.2d, v11.2d, v17.2d\n" + "uzp1 v12.2d, v12.2d, v18.2d\n" + "uzp1 v13.2d, v13.2d, v19.2d\n" + "tbz %x[flags], #1, 29f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "29:" // Height 1: No activation + "cmp x9, #0x18\n" + "bge 42f\n" + "tbz x9, #4, 33f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x26], #0x10\n" + "tbz x9, #2, 31f\n" + "st1 { v12.4s }, [x26], #0x10\n" + "tbz x9, #1, 30f\n" + "str d13, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v13.s }[2], [x26]\n" + "b 41f\n" + "30:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x9, #0, 41f\n" + "str s13, [x26, #0x0]\n" + "b 41f\n" + "31:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x9, #1, 32f\n" + "str d12, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v12.s }[2], [x26]\n" + "b 41f\n" + "32:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x9, #0, 41f\n" + "str s12, [x26, #0x0]\n" + "b 41f\n" + "33:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x9, #3, 37f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "tbz x9, #2, 35f\n" + "st1 { v10.4s }, [x26], #0x10\n" + "tbz x9, #1, 34f\n" + "str d11, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v11.s }[2], [x26]\n" + "b 41f\n" + "34:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 41f\n" + "str s11, [x26, #0x0]\n" + "b 41f\n" + "35:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 36f\n" + "str d10, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v10.s }[2], [x26]\n" + "b 41f\n" + "36:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 41f\n" + "str s10, [x26, #0x0]\n" + "b 41f\n" + "37:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 39f\n" + "st1 { v8.4s }, [x26], #0x10\n" + "tbz x9, #1, 38f\n" + "str d9, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v9.s }[2], [x26]\n" + "b 41f\n" + "38:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 41f\n" + "str s9, [x26, #0x0]\n" + "b 41f\n" + "39:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 40f\n" + "str d8, [x26], #0x8\n" + "tbz x9, #0, 41f\n" + "st1 { v8.s }[2], [x26]\n" + "b 41f\n" + "40:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x26, #0x0]\n" + "41:" // Height 1: Partial direct writeback: Done + "b 43f\n" + "42:" // Height 1: Full writeback + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q12, [x26, #0x40]\n" + "str q13, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "43:" // Height 1: Writeback done + "subs x9, x9, #0x18\n" + "bgt 2b\n" + "b 174f\n" + "44:" // Height 2 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr x28, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "45:" // Height 2: Column loop + "cbz x27, 46f\n" + "ldr q8, [x27, #0x0]\n" + "zip2 v14.2d, v8.2d, v8.2d\n" + "ldr q9, [x27, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x27, #0x20]\n" + "ldr q11, [x27, #0x30]\n" + "zip2 v15.2d, v9.2d, v9.2d\n" + "ldr q12, [x27, #0x40]\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "ldr q13, [x27, #0x50]\n" + "add x27, x27, #0x60\n" + "zip2 v16.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v17.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "zip2 v18.2d, v12.2d, v12.2d\n" + "zip1 v12.2d, v12.2d, v12.2d\n" + "zip2 v19.2d, v13.2d, v13.2d\n" + "zip1 v13.2d, v13.2d, v13.2d\n" + "b 62f\n" + "46:" // Height 2: no bias + "tbz %x[flags], #0, 61f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "bge 59f\n" + "tbz x9, #4, 50f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x9, #2, 48f\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x9, #1, 47f\n" + "mov x19, #0x58\n" + "ldr d20, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x9, #0, 58f\n" + "ld1 { v20.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 58f\n" + "47:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 58f\n" + "ldr s20, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 58f\n" + "48:" // Height 2: Partial accumulate: partial_2_16 + "tbz x9, #1, 49f\n" + "ldr d13, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "tbz x9, #0, 58f\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 58f\n" + "49:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 58f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 58f\n" + "50:" // Height 2: Partial accumulate: partial_8_0 + "tbz x9, #3, 54f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "tbz x9, #2, 52f\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x9, #1, 51f\n" + "mov x19, #0x38\n" + "ldr d12, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x9, #0, 58f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 58f\n" + "51:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 58f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 58f\n" + "52:" // Height 2: Partial accumulate: partial_2_8 + "tbz x9, #1, 53f\n" + "ldr d11, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "tbz x9, #0, 58f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 58f\n" + "53:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 58f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "b 58f\n" + "54:" // Height 2: Partial accumulate: partial_4_0 + "tbz x9, #2, 56f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "tbz x9, #1, 55f\n" + "mov x19, #0x18\n" + "ldr d10, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "tbz x9, #0, 58f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "b 58f\n" + "55:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 58f\n" + 
"ldr s10, [x26, #0x0]\n" + "ldr s15, [x22, #0x0]\n" + "b 58f\n" + "56:" // Height 2: Partial accumulate: partial_2_0 + "tbz x9, #1, 57f\n" + "ldr d9, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "tbz x9, #0, 58f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "b 58f\n" + "57:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "58:" // Height 2: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 60f\n" + "59:" // Height 2: full accumulate + "ldr q9, [x26, #0x0]\n" + "ldr q10, [x26, #0x10]\n" + "ldr q11, [x26, #0x20]\n" + "ldr q12, [x26, #0x30]\n" + "ldr q13, [x26, #0x40]\n" + "ldr q20, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + "ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "60:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v14.2d\n" + "zip2 v14.2d, v9.2d, v14.2d\n" + "zip1 v9.2d, v10.2d, v15.2d\n" + "zip2 v15.2d, v10.2d, v15.2d\n" + "zip1 v10.2d, v11.2d, v16.2d\n" + "zip2 v16.2d, v11.2d, v16.2d\n" + "zip1 v11.2d, v12.2d, v17.2d\n" + "zip2 v17.2d, v12.2d, v17.2d\n" + "zip1 v12.2d, v13.2d, v18.2d\n" + "zip2 v18.2d, v13.2d, v18.2d\n" + "zip1 v13.2d, v20.2d, v19.2d\n" + "zip2 v19.2d, v20.2d, v19.2d\n" + "b 62f\n" + "61:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "62:" // Height 2: setup done + "mov x25, #0x0\n" + "63:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 65f\n" + "64:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "65:" // Height 2: input setup done + "cmp x24, #0x4\n" + "blt 68f\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "cmp x24, #0x8\n" + "blt 67f\n" + "66:" // Height 2: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q4, [x28, #0x0]\n" + "cmp x24, #0x8\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q5, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q4, [x28, #0x40]\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ldr q6, [x28, #0x60]\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + "ldr q4, [x28, #0x80]\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + 
".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "bge 66b\n" + "67:" // Height 2: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q7, [x28, #0x30]\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q5, [x28, #0x50]\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q7, [x28, #0x70]\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "68:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 71f\n" + "cbz x24, 71f\n" + "tbz x24, #1, 69f\n" + "ldr d0, [x23], #0x8\n" + "ldr d1, [x22], #0x8\n" + "tbz x24, #0, 70f\n" + "ld1 { v0.s }[2], [x23]\n" + "ld1 { v1.s }[2], [x22]\n" + "b 70f\n" + "69:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x23, #0x0]\n" + "ldr s1, [x22, #0x0]\n" + "70:" // Height 2: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "71:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 63b\n" + "uzp1 v4.2d, v8.2d, v14.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v14.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "uzp1 v14.2d, v9.2d, v15.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp2 v9.2d, v9.2d, v15.2d\n" + "uzp1 v15.2d, v10.2d, v16.2d\n" + "uzp2 v10.2d, 
v10.2d, v16.2d\n" + "uzp1 v16.2d, v11.2d, v17.2d\n" + "uzp2 v11.2d, v11.2d, v17.2d\n" + "uzp1 v17.2d, v12.2d, v18.2d\n" + "uzp2 v12.2d, v12.2d, v18.2d\n" + "uzp1 v18.2d, v13.2d, v19.2d\n" + "uzp2 v13.2d, v13.2d, v19.2d\n" + "tbz %x[flags], #1, 72f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v4.4s, v4.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v4.4s, v4.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "72:" // Height 2: No activation + "cmp x9, #0x18\n" + "bge 85f\n" + "tbz x9, #4, 76f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "st1 { v11.4s }, [x22], #0x10\n" + "tbz x9, #2, 74f\n" + "st1 { v17.4s }, [x26], #0x10\n" + "st1 { v12.4s }, [x22], #0x10\n" + "tbz x9, #1, 73f\n" + "str d18, [x26], #0x8\n" + "str d13, [x22], #0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v18.s }[2], [x26]\n" + "st1 { v13.s }[2], [x22]\n" + "b 84f\n" + "73:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x9, #0, 84f\n" + "str s18, [x26, #0x0]\n" + "str s13, [x22, #0x0]\n" + "b 84f\n" + "74:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x9, #1, 75f\n" + "str d17, [x26], #0x8\n" + "str d12, [x22], #0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v17.s }[2], [x26]\n" + "st1 { v12.s }[2], [x22]\n" + "b 84f\n" + "75:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x9, #0, 84f\n" + "str s17, [x26, #0x0]\n" + "str s12, [x22, #0x0]\n" + "b 84f\n" + "76:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x9, #3, 80f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "tbz x9, #2, 78f\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "tbz x9, #1, 77f\n" + "str d16, [x26], #0x8\n" + "str d11, [x22], #0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v16.s }[2], [x26]\n" + "st1 { v11.s }[2], [x22]\n" + "b 84f\n" + "77:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x9, #0, 84f\n" + "str s16, [x26, #0x0]\n" + "str s11, [x22, #0x0]\n" + "b 84f\n" + "78:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 79f\n" + "str d15, [x26], #0x8\n" + "str d10, [x22], #0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v15.s }[2], [x26]\n" + "st1 { v10.s }[2], [x22]\n" + "b 84f\n" + "79:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 84f\n" + "str s15, [x26, #0x0]\n" + "str s10, [x22, #0x0]\n" + "b 84f\n" + "80:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 82f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "tbz x9, #1, 81f\n" + "str d14, [x26], #0x8\n" + "str d9, [x22], 
#0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v14.s }[2], [x26]\n" + "st1 { v9.s }[2], [x22]\n" + "b 84f\n" + "81:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 84f\n" + "str s14, [x26, #0x0]\n" + "str s9, [x22, #0x0]\n" + "b 84f\n" + "82:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 83f\n" + "str d4, [x26], #0x8\n" + "str d8, [x22], #0x8\n" + "tbz x9, #0, 84f\n" + "st1 { v4.s }[2], [x26]\n" + "st1 { v8.s }[2], [x22]\n" + "b 84f\n" + "83:" // Height 2: Partial direct writeback: partial_1_0 + "str s4, [x26, #0x0]\n" + "str s8, [x22, #0x0]\n" + "84:" // Height 2: Partial direct writeback: Done + "b 86f\n" + "85:" // Height 2: Full writeback + "str q4, [x26, #0x0]\n" + "str q14, [x26, #0x10]\n" + "str q15, [x26, #0x20]\n" + "str q16, [x26, #0x30]\n" + "str q17, [x26, #0x40]\n" + "str q18, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q8, [x22, #0x0]\n" + "str q9, [x22, #0x10]\n" + "str q10, [x22, #0x20]\n" + "str q11, [x22, #0x30]\n" + "str q12, [x22, #0x40]\n" + "str q13, [x22, #0x50]\n" + "86:" // Height 2: Writeback done + "subs x9, x9, #0x18\n" + "bgt 45b\n" + "b 174f\n" + "87:" // Height 3 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "88:" // Height 3: Column loop + "cbz x27, 89f\n" + "ldr q8, [x27, #0x0]\n" + "zip2 v14.2d, v8.2d, v8.2d\n" + "ldr q9, [x27, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x27, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x27, #0x30]\n" + "mov v26.16b, v14.16b\n" + "ldr q12, [x27, #0x40]\n" + "ldr q13, [x27, #0x50]\n" + "zip2 v15.2d, v9.2d, v9.2d\n" + "add x27, x27, #0x60\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v16.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v17.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "zip2 v18.2d, v12.2d, v12.2d\n" + "zip1 v12.2d, v12.2d, v12.2d\n" + "zip2 v19.2d, v13.2d, v13.2d\n" + "zip1 v13.2d, v13.2d, v13.2d\n" + "mov v21.16b, v9.16b\n" + "mov v27.16b, v15.16b\n" + "mov v22.16b, v10.16b\n" + "mov v28.16b, v16.16b\n" + "mov v23.16b, v11.16b\n" + "mov v29.16b, v17.16b\n" + "mov v24.16b, v12.16b\n" + "mov v30.16b, v18.16b\n" + "mov v25.16b, v13.16b\n" + "mov v31.16b, v19.16b\n" + "b 105f\n" + "89:" // Height 3: no bias + "tbz %x[flags], #0, 104f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 102f\n" + "tbz x9, #4, 93f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "tbz x9, #2, 91f\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "tbz x9, #1, 90f\n" + "mov x19, #0x58\n" + "ldr d20, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d4, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v20.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v4.s }[2], [x21]\n" + "b 101f\n" + "90:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 101f\n" + "ldr s20, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s4, [x21, #0x0]\n" + "b 101f\n" + "91:" // Height 3: Partial accumulate: partial_2_16 + "tbz 
x9, #1, 92f\n" + "ldr d13, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "ldr d25, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "b 101f\n" + "92:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 101f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "b 101f\n" + "93:" // Height 3: Partial accumulate: partial_8_0 + "tbz x9, #3, 97f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x9, #2, 95f\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "tbz x9, #1, 94f\n" + "mov x19, #0x38\n" + "ldr d12, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "b 101f\n" + "94:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 101f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "b 101f\n" + "95:" // Height 3: Partial accumulate: partial_2_8 + "tbz x9, #1, 96f\n" + "ldr d11, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "ldr d23, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 101f\n" + "96:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 101f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 101f\n" + "97:" // Height 3: Partial accumulate: partial_4_0 + "tbz x9, #2, 99f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x9, #1, 98f\n" + "mov x19, #0x18\n" + "ldr d10, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 101f\n" + "98:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 101f\n" + "ldr s10, [x26, #0x0]\n" + "ldr s15, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 101f\n" + "99:" // Height 3: Partial accumulate: partial_2_0 + "tbz x9, #1, 100f\n" + "ldr d9, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x9, #0, 101f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 101f\n" + "100:" // Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "101:" // Height 3: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 103f\n" + "102:" // Height 3: full accumulate + "ldr q9, [x26, #0x0]\n" + "ldr q10, [x26, #0x10]\n" + "ldr q11, [x26, #0x20]\n" + "ldr q12, [x26, #0x30]\n" + "ldr q13, [x26, #0x40]\n" + "ldr q20, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + "ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "ldr q21, [x21, #0x0]\n" + "ldr q22, [x21, #0x10]\n" + "ldr q23, [x21, #0x20]\n" + "ldr q24, [x21, #0x30]\n" + "ldr q25, [x21, #0x40]\n" + "ldr q4, [x21, #0x50]\n" + "103:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v14.2d\n" + "zip2 
v14.2d, v9.2d, v14.2d\n" + "zip1 v9.2d, v10.2d, v15.2d\n" + "zip2 v15.2d, v10.2d, v15.2d\n" + "zip1 v10.2d, v11.2d, v16.2d\n" + "zip2 v16.2d, v11.2d, v16.2d\n" + "zip1 v11.2d, v12.2d, v17.2d\n" + "zip2 v17.2d, v12.2d, v17.2d\n" + "zip1 v12.2d, v13.2d, v18.2d\n" + "zip2 v18.2d, v13.2d, v18.2d\n" + "zip1 v13.2d, v20.2d, v19.2d\n" + "zip2 v19.2d, v20.2d, v19.2d\n" + "zip1 v20.2d, v21.2d, v26.2d\n" + "zip2 v26.2d, v21.2d, v26.2d\n" + "zip1 v21.2d, v22.2d, v27.2d\n" + "zip2 v27.2d, v22.2d, v27.2d\n" + "zip1 v22.2d, v23.2d, v28.2d\n" + "zip2 v28.2d, v23.2d, v28.2d\n" + "zip1 v23.2d, v24.2d, v29.2d\n" + "zip2 v29.2d, v24.2d, v29.2d\n" + "zip1 v24.2d, v25.2d, v30.2d\n" + "zip2 v30.2d, v25.2d, v30.2d\n" + "zip1 v25.2d, v4.2d, v31.2d\n" + "zip2 v31.2d, v4.2d, v31.2d\n" + "b 105f\n" + "104:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "105:" // Height 3: setup done + "mov x25, #0x0\n" + "106:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 107f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 108f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 108f\n" + "107:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "108:" // Height 3: input setup done + "cmp x24, #0x4\n" + "blt 111f\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "cmp x24, #0x8\n" + "blt 110f\n" + "109:" // Height 3: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x21], #0x10\n" + "cmp x24, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" 
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x23], #0x10\n" + ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + "bge 109b\n" + "110:" // Height 3: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x21], #0x10\n" + "ldr q4, [x28, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x40]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + "111:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 114f\n" + "cbz x24, 114f\n" + "tbz x24, #1, 112f\n" + "ldr d0, [x23], #0x8\n" + "ldr d1, [x22], #0x8\n" + "ldr d2, [x21], #0x8\n" + "tbz x24, #0, 113f\n" + "ld1 { v0.s }[2], [x23]\n" + "ld1 { v1.s }[2], [x22]\n" + "ld1 { v2.s }[2], [x21]\n" + "b 113f\n" + "112:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x23, #0x0]\n" + "ldr s1, [x22, #0x0]\n" + "ldr s2, [x21, #0x0]\n" + "113:" // Height 3: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q5, 
[x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + "114:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 106b\n" + "uzp1 v4.2d, v8.2d, v14.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v14.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "uzp1 v14.2d, v9.2d, v15.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp2 v9.2d, v9.2d, v15.2d\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v15.2d, v10.2d, v16.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v10.2d, v10.2d, v16.2d\n" + "uzp1 v16.2d, v11.2d, v17.2d\n" + "uzp2 v11.2d, v11.2d, v17.2d\n" + "uzp1 v17.2d, v12.2d, v18.2d\n" + "uzp2 v12.2d, v12.2d, v18.2d\n" + "uzp1 v18.2d, v13.2d, v19.2d\n" + "uzp2 v13.2d, v13.2d, v19.2d\n" + "uzp1 v20.2d, v20.2d, v26.2d\n" + "uzp1 v21.2d, v21.2d, v27.2d\n" + "uzp1 v22.2d, v22.2d, v28.2d\n" + "uzp1 v23.2d, v23.2d, v29.2d\n" + "uzp1 v24.2d, v24.2d, v30.2d\n" + "uzp1 v25.2d, v25.2d, v31.2d\n" + "tbz %x[flags], #1, 115f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v4.4s, v4.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v4.4s, v4.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v20.4s, 
v20.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "115:" // Height 3: No activation + "cmp x9, #0x18\n" + "bge 128f\n" + "tbz x9, #4, 119f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "st1 { v11.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "tbz x9, #2, 117f\n" + "st1 { v17.4s }, [x26], #0x10\n" + "st1 { v12.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x9, #1, 116f\n" + "str d18, [x26], #0x8\n" + "str d13, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x9, #0, 127f\n" + "st1 { v18.s }[2], [x26]\n" + "st1 { v13.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 127f\n" + "116:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x9, #0, 127f\n" + "str s18, [x26, #0x0]\n" + "str s13, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 127f\n" + "117:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x9, #1, 118f\n" + "str d17, [x26], #0x8\n" + "str d12, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x9, #0, 127f\n" + "st1 { v17.s }[2], [x26]\n" + "st1 { v12.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 127f\n" + "118:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x9, #0, 127f\n" + "str s17, [x26, #0x0]\n" + "str s12, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "b 127f\n" + "119:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x9, #3, 123f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x9, #2, 121f\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x9, #1, 120f\n" + "str d16, [x26], #0x8\n" + "str d11, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x9, #0, 127f\n" + "st1 { v16.s }[2], [x26]\n" + "st1 { v11.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 127f\n" + "120:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x9, #0, 127f\n" + "str s16, [x26, #0x0]\n" + "str s11, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 127f\n" + "121:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 122f\n" + "str d15, [x26], #0x8\n" + "str d10, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x9, #0, 127f\n" + "st1 { v15.s }[2], [x26]\n" + "st1 { v10.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 127f\n" + "122:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 127f\n" + "str s15, [x26, #0x0]\n" + "str s10, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 127f\n" + "123:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x9, #2, 125f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x9, #1, 124f\n" + "str d14, [x26], #0x8\n" + "str d9, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x9, 
#0, 127f\n" + "st1 { v14.s }[2], [x26]\n" + "st1 { v9.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 127f\n" + "124:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 127f\n" + "str s14, [x26, #0x0]\n" + "str s9, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 127f\n" + "125:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 126f\n" + "str d4, [x26], #0x8\n" + "str d8, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x9, #0, 127f\n" + "st1 { v4.s }[2], [x26]\n" + "st1 { v8.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 127f\n" + "126:" // Height 3: Partial direct writeback: partial_1_0 + "str s4, [x26, #0x0]\n" + "str s8, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "127:" // Height 3: Partial direct writeback: Done + "b 129f\n" + "128:" // Height 3: Full writeback + "str q4, [x26, #0x0]\n" + "str q14, [x26, #0x10]\n" + "str q15, [x26, #0x20]\n" + "str q16, [x26, #0x30]\n" + "str q17, [x26, #0x40]\n" + "str q18, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q8, [x22, #0x0]\n" + "str q9, [x22, #0x10]\n" + "str q10, [x22, #0x20]\n" + "str q11, [x22, #0x30]\n" + "str q12, [x22, #0x40]\n" + "str q13, [x22, #0x50]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x21, #0x40]\n" + "str q25, [x21, #0x50]\n" + "129:" // Height 3: Writeback done + "subs x9, x9, #0x18\n" + "bgt 88b\n" + "b 174f\n" + "130:" // Height 4 + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x10\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "131:" // Height 4: Column loop + "cbz x27, 132f\n" + "ldr q8, [x27, #0x0]\n" + "zip2 v14.2d, v8.2d, v8.2d\n" + "ldr q9, [x27, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x27, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x27, #0x30]\n" + "mov v26.16b, v14.16b\n" + "ldr q12, [x27, #0x40]\n" + "ldr q13, [x27, #0x50]\n" + "zip2 v15.2d, v9.2d, v9.2d\n" + "add x27, x27, #0x60\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v16.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v17.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "zip2 v18.2d, v12.2d, v12.2d\n" + "zip1 v12.2d, v12.2d, v12.2d\n" + "zip2 v19.2d, v13.2d, v13.2d\n" + "zip1 v13.2d, v13.2d, v13.2d\n" + "mov v21.16b, v9.16b\n" + "mov v27.16b, v15.16b\n" + "mov v22.16b, v10.16b\n" + "mov v28.16b, v16.16b\n" + "mov v23.16b, v11.16b\n" + "mov v29.16b, v17.16b\n" + "mov v24.16b, v12.16b\n" + "mov v30.16b, v18.16b\n" + "mov v25.16b, v13.16b\n" + "mov v31.16b, v19.16b\n" + "b 148f\n" + "132:" // Height 4: no bias + "tbz %x[flags], #0, 147f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x9, #0x18\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 145f\n" + "tbz x9, #4, 136f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x9, #2, 
134f\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x9, #1, 133f\n" + "mov x19, #0x58\n" + "ldr d20, [x26], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d4, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v20.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v4.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 144f\n" + "133:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x50\n" + "tbz x9, #0, 144f\n" + "ldr s20, [x26, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s4, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 144f\n" + "134:" // Height 4: Partial accumulate: partial_2_16 + "tbz x9, #1, 135f\n" + "ldr d13, [x26], #0x8\n" + "ldr d18, [x22], #0x8\n" + "mov x19, #0x48\n" + "ldr d25, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 144f\n" + "135:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x40\n" + "tbz x9, #0, 144f\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 144f\n" + "136:" // Height 4: Partial accumulate: partial_8_0 + "tbz x9, #3, 140f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x26], #0x10\n" + "ld1 { v15.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "tbz x9, #2, 138f\n" + "ld1 { v11.4s }, [x26], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v23.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x9, #1, 137f\n" + "mov x19, #0x38\n" + "ldr d12, [x26], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 144f\n" + "137:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x9, #0, 144f\n" + "ldr s12, [x26, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "ldr s29, [x20, #0x0]\n" + "b 144f\n" + "138:" // Height 4: Partial accumulate: partial_2_8 + "tbz x9, #1, 139f\n" + "ldr d11, [x26], #0x8\n" + "ldr d16, [x22], #0x8\n" + "mov x19, #0x28\n" + "ldr d23, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v11.s }[2], [x26]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 144f\n" + "139:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x9, #0, 144f\n" + "ldr s11, [x26, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + "b 144f\n" + "140:" // Height 4: Partial accumulate: partial_4_0 + "tbz x9, #2, 142f\n" + "ld1 { v9.4s }, [x26], #0x10\n" + "ld1 { v14.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x9, #1, 141f\n" + "mov x19, #0x18\n" + "ldr d10, [x26], #0x8\n" + "ldr d15, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v10.s }[2], [x26]\n" + "ld1 { v15.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 144f\n" + "141:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x9, #0, 144f\n" + "ldr s10, [x26, #0x0]\n" + "ldr s15, [x22, 
#0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 144f\n" + "142:" // Height 4: Partial accumulate: partial_2_0 + "tbz x9, #1, 143f\n" + "ldr d9, [x26], #0x8\n" + "ldr d14, [x22], #0x8\n" + "mov x19, #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x9, #0, 144f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v14.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 144f\n" + "143:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x26, #0x0]\n" + "mov x19, #0x0\n" + "ldr s14, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "144:" // Height 4: Partial accumulate: Done + "sub x26, x26, x19\n" + "b 146f\n" + "145:" // Height 4: full accumulate + "ldr q9, [x26, #0x0]\n" + "ldr q10, [x26, #0x10]\n" + "ldr q11, [x26, #0x20]\n" + "ldr q12, [x26, #0x30]\n" + "ldr q13, [x26, #0x40]\n" + "ldr q20, [x26, #0x50]\n" + "ldr q14, [x22, #0x0]\n" + "ldr q15, [x22, #0x10]\n" + "ldr q16, [x22, #0x20]\n" + "ldr q17, [x22, #0x30]\n" + "ldr q18, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + "ldr q21, [x21, #0x0]\n" + "ldr q22, [x21, #0x10]\n" + "ldr q23, [x21, #0x20]\n" + "ldr q24, [x21, #0x30]\n" + "ldr q25, [x21, #0x40]\n" + "ldr q4, [x21, #0x50]\n" + "ldr q26, [x20, #0x0]\n" + "ldr q27, [x20, #0x10]\n" + "ldr q28, [x20, #0x20]\n" + "ldr q29, [x20, #0x30]\n" + "ldr q30, [x20, #0x40]\n" + "ldr q31, [x20, #0x50]\n" + "146:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v14.2d\n" + "zip2 v14.2d, v9.2d, v14.2d\n" + "zip1 v9.2d, v10.2d, v15.2d\n" + "zip2 v15.2d, v10.2d, v15.2d\n" + "zip1 v10.2d, v11.2d, v16.2d\n" + "zip2 v16.2d, v11.2d, v16.2d\n" + "zip1 v11.2d, v12.2d, v17.2d\n" + "zip2 v17.2d, v12.2d, v17.2d\n" + "zip1 v12.2d, v13.2d, v18.2d\n" + "zip2 v18.2d, v13.2d, v18.2d\n" + "zip1 v13.2d, v20.2d, v19.2d\n" + "zip2 v19.2d, v20.2d, v19.2d\n" + "zip1 v20.2d, v21.2d, v26.2d\n" + "zip2 v26.2d, v21.2d, v26.2d\n" + "zip1 v21.2d, v22.2d, v27.2d\n" + "zip2 v27.2d, v22.2d, v27.2d\n" + "zip1 v22.2d, v23.2d, v28.2d\n" + "zip2 v28.2d, v23.2d, v28.2d\n" + "zip1 v23.2d, v24.2d, v29.2d\n" + "zip2 v29.2d, v24.2d, v29.2d\n" + "zip1 v24.2d, v25.2d, v30.2d\n" + "zip2 v30.2d, v25.2d, v30.2d\n" + "zip1 v25.2d, v4.2d, v31.2d\n" + "zip2 v31.2d, v4.2d, v31.2d\n" + "b 148f\n" + "147:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "148:" // Height 4: setup done + "mov x25, #0x0\n" + "149:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 150f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 151f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL 
#2\n" + "b 151f\n" + "150:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "151:" // Height 4: input setup done + "cmp x24, #0x4\n" + "blt 154f\n" + "ld1 { v0.4s }, [x23], #0x10\n" + "cmp x24, #0x8\n" + "blt 153f\n" + "152:" // Height 4: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x21], #0x10\n" + "cmp x24, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v3.4s }, [x20], #0x10\n" + "ldr q4, [x28, #0x0]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x40]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x23], #0x10\n" + ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + "bge 152b\n" + "153:" // Height 4: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x22], #0x10\n" + "sub x24, x24, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x21], #0x10\n" + "ld1 { v3.4s }, [x20], #0x10\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x28, #0x10]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + 
"prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + "154:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 157f\n" + "cbz x24, 157f\n" + "tbz x24, #1, 155f\n" + "ldr d0, [x23], #0x8\n" + "ldr d1, [x22], #0x8\n" + "ldr d2, [x21], #0x8\n" + "ldr d3, [x20], #0x8\n" + "tbz x24, #0, 156f\n" + "ld1 { v0.s }[2], [x23]\n" + "ld1 { v1.s }[2], [x22]\n" + "ld1 { v2.s }[2], [x21]\n" + "ld1 { v3.s }[2], [x20]\n" + "b 156f\n" + "155:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x23, #0x0]\n" + "ldr s1, [x22, #0x0]\n" + "ldr s2, [x21, #0x0]\n" + "ldr s3, [x20, #0x0]\n" + "156:" // Height 4: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q4, [x28, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q5, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x40]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x50]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0x70]\n" + ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x80]\n" + ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x90]\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x28, #0xa0]\n" + ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" + "ldr q7, [x28, #0xb0]\n" + "add x28, x28, #0xc0\n" + ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec5f // 
bfmmla v31.4s, v2.8h, v7.8h\n" + "157:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 149b\n" + "uzp1 v4.2d, v8.2d, v14.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v14.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x22, x26, x19, LSL #2\n" + "uzp1 v14.2d, v9.2d, v15.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp2 v9.2d, v9.2d, v15.2d\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v15.2d, v10.2d, v16.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v16.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp1 v16.2d, v11.2d, v17.2d\n" + "uzp2 v11.2d, v11.2d, v17.2d\n" + "uzp1 v17.2d, v12.2d, v18.2d\n" + "uzp2 v12.2d, v12.2d, v18.2d\n" + "uzp1 v18.2d, v13.2d, v19.2d\n" + "uzp2 v13.2d, v13.2d, v19.2d\n" + "uzp1 v19.2d, v20.2d, v26.2d\n" + "uzp2 v20.2d, v20.2d, v26.2d\n" + "uzp1 v26.2d, v21.2d, v27.2d\n" + "uzp2 v21.2d, v21.2d, v27.2d\n" + "uzp1 v27.2d, v22.2d, v28.2d\n" + "uzp2 v22.2d, v22.2d, v28.2d\n" + "uzp1 v28.2d, v23.2d, v29.2d\n" + "uzp2 v23.2d, v23.2d, v29.2d\n" + "uzp1 v29.2d, v24.2d, v30.2d\n" + "uzp2 v24.2d, v24.2d, v30.2d\n" + "uzp1 v30.2d, v25.2d, v31.2d\n" + "uzp2 v25.2d, v25.2d, v31.2d\n" + "tbz %x[flags], #1, 158f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v4.4s, v4.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v4.4s, v4.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "158:" // Height 4: No activation + "cmp x9, #0x18\n" + "bge 171f\n" + "tbz x9, #4, 162f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "st1 { v11.4s }, [x22], #0x10\n" + "st1 { v19.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "st1 { v27.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + 
"st1 { v20.4s }, [x20], #0x10\n" + "st1 { v21.4s }, [x20], #0x10\n" + "st1 { v22.4s }, [x20], #0x10\n" + "st1 { v23.4s }, [x20], #0x10\n" + "tbz x9, #2, 160f\n" + "st1 { v17.4s }, [x26], #0x10\n" + "st1 { v12.4s }, [x22], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x9, #1, 159f\n" + "str d18, [x26], #0x8\n" + "str d13, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v18.s }[2], [x26]\n" + "st1 { v13.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 170f\n" + "159:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x9, #0, 170f\n" + "str s18, [x26, #0x0]\n" + "str s13, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 170f\n" + "160:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x9, #1, 161f\n" + "str d17, [x26], #0x8\n" + "str d12, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v17.s }[2], [x26]\n" + "st1 { v12.s }[2], [x22]\n" + "st1 { v29.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 170f\n" + "161:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x9, #0, 170f\n" + "str s17, [x26, #0x0]\n" + "str s12, [x22, #0x0]\n" + "str s29, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "b 170f\n" + "162:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x9, #3, 166f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v14.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v19.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "st1 { v20.4s }, [x20], #0x10\n" + "st1 { v21.4s }, [x20], #0x10\n" + "tbz x9, #2, 164f\n" + "st1 { v15.4s }, [x26], #0x10\n" + "st1 { v10.4s }, [x22], #0x10\n" + "st1 { v27.4s }, [x21], #0x10\n" + "st1 { v22.4s }, [x20], #0x10\n" + "tbz x9, #1, 163f\n" + "str d16, [x26], #0x8\n" + "str d11, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v16.s }[2], [x26]\n" + "st1 { v11.s }[2], [x22]\n" + "st1 { v28.s }[2], [x21]\n" + "st1 { v23.s }[2], [x20]\n" + "b 170f\n" + "163:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 170f\n" + "str s16, [x26, #0x0]\n" + "str s11, [x22, #0x0]\n" + "str s28, [x21, #0x0]\n" + "str s23, [x20, #0x0]\n" + "b 170f\n" + "164:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 165f\n" + "str d15, [x26], #0x8\n" + "str d10, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "str d22, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v15.s }[2], [x26]\n" + "st1 { v10.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "st1 { v22.s }[2], [x20]\n" + "b 170f\n" + "165:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 170f\n" + "str s15, [x26, #0x0]\n" + "str s10, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "str s22, [x20, #0x0]\n" + "b 170f\n" + "166:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 168f\n" + "st1 { v4.4s }, [x26], #0x10\n" + "st1 { v8.4s }, [x22], #0x10\n" + "st1 { v19.4s }, [x21], #0x10\n" + "st1 { v20.4s }, [x20], #0x10\n" + "tbz x9, #1, 167f\n" + "str d14, [x26], #0x8\n" + "str d9, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "str d21, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v14.s }[2], [x26]\n" + "st1 { v9.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "st1 { v21.s }[2], [x20]\n" + "b 170f\n" + "167:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 170f\n" + "str s14, [x26, #0x0]\n" + "str s9, [x22, #0x0]\n" 
+ "str s26, [x21, #0x0]\n" + "str s21, [x20, #0x0]\n" + "b 170f\n" + "168:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x9, #1, 169f\n" + "str d4, [x26], #0x8\n" + "str d8, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "str d20, [x20], #0x8\n" + "tbz x9, #0, 170f\n" + "st1 { v4.s }[2], [x26]\n" + "st1 { v8.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "st1 { v20.s }[2], [x20]\n" + "b 170f\n" + "169:" // Height 4: Partial direct writeback: partial_1_0 + "str s4, [x26, #0x0]\n" + "str s8, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "str s20, [x20, #0x0]\n" + "170:" // Height 4: Partial direct writeback: Done + "b 172f\n" + "171:" // Height 4: Full writeback + "str q4, [x26, #0x0]\n" + "str q14, [x26, #0x10]\n" + "str q15, [x26, #0x20]\n" + "str q16, [x26, #0x30]\n" + "str q17, [x26, #0x40]\n" + "str q18, [x26, #0x50]\n" + "add x26, x26, #0x60\n" + "str q8, [x22, #0x0]\n" + "str q9, [x22, #0x10]\n" + "str q10, [x22, #0x20]\n" + "str q11, [x22, #0x30]\n" + "str q12, [x22, #0x40]\n" + "str q13, [x22, #0x50]\n" + "str q19, [x21, #0x0]\n" + "str q26, [x21, #0x10]\n" + "str q27, [x21, #0x20]\n" + "str q28, [x21, #0x30]\n" + "str q29, [x21, #0x40]\n" + "str q30, [x21, #0x50]\n" + "str q20, [x20, #0x0]\n" + "str q21, [x20, #0x10]\n" + "str q22, [x20, #0x20]\n" + "str q23, [x20, #0x30]\n" + "str q24, [x20, #0x40]\n" + "str q25, [x20, #0x50]\n" + "172:" // Height 4: Writeback done + "subs x9, x9, #0x18\n" + "bgt 131b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 174f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 173f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "173:" // Update direct input + "mov x19, #0x10\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "174:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp new file mode 100644 index 0000000000..f5e9009f6d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<float>, \
+    size_t, size_t, \
+    const bfloat16 *, \
+    IndirectOutputArg<float>, \
+    const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_fp32bf16fp32_mmla_6x16
+{
+public:
+    typedef float lhs_operand_type;
+    typedef bfloat16 rhs_operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        if (std::is_same<T, float>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 16.37 };
+                case CPUModel::A510:
+                    return { 6.70 };
+                case CPUModel::V1:
+                    return { 26.64 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_6x16;
+    cls_a64_hybrid_fp32bf16fp32_mmla_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
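For readers tracing the new kernel: a minimal scalar model of the arithmetic it implements, assuming the fp32 LHS is narrowed to bfloat16 in-register (the bfcvtn/bfcvtn2 instructions in the assembly below) while the pretransposed RHS is already bfloat16, and all accumulation stays in fp32. This is a sketch, not library code: the helper names are illustrative only, the narrowing is modelled by truncation whereas the hardware conversion rounds, and bias, activation and the kernel's blocked operand layout are ignored.

    #include <cstdint>
    #include <cstring>

    // Illustrative helper: push an fp32 value through bf16 precision by
    // truncating the low 16 bits (sign, exponent and 7 mantissa bits survive).
    // The hardware conversion rounds, so low-order bits can differ slightly.
    static float through_bf16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits &= 0xffff0000u;
        std::memcpy(&x, &bits, sizeof(x));
        return x;
    }

    // C[M x N] += bf16(A[M x K]) * bf16(B[K x N]), accumulated in fp32.
    static void reference_fp32bf16fp32(const float *a, const float *b, float *c,
                                       int rows, int cols, int depth) {
        for (int m = 0; m < rows; m++) {
            for (int n = 0; n < cols; n++) {
                float acc = 0.0f;
                for (int k = 0; k < depth; k++) {
                    acc += through_bf16(a[m * depth + k]) * through_bf16(b[k * cols + n]);
                }
                c[m * cols + n] += acc;
            }
        }
    }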
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..19dbf0588e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3137 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32bf16fp32_mmla_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const bfloat16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 176f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 141f\n"
+      "beq 106f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 71f\n"
+      "beq 36f\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x9, %x[bias]\n"
+      "mov x28, %x[output_ptr]\n"
+      "2:" // Height 1: Column loop
+      "cbz x9, 3f\n"
+      "ldr q8, [x9, #0x0]\n"
+      "zip2 v12.2d, v8.2d, v8.2d\n"
+      "ldr q9, [x9, #0x10]\n"
+      "zip1 v8.2d, v8.2d, v8.2d\n"
+      "ldr q10, [x9, #0x20]\n"
+      "ldr q11, [x9, #0x30]\n"
+      "zip2 v13.2d, v9.2d, v9.2d\n"
+      "add x9, x9, #0x40\n"
+      "zip1 v9.2d, 
v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 15f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #2, 5f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "tbz x11, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v16.s }[2], [x28]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 11f\n" + "ldr s16, [x28, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr d11, [x28], #0x8\n" + "mov x19, #0x28\n" + "tbz x11, #0, 11f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 11f\n" + "ldr s11, [x28, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #1, 8f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "tbz x11, #0, 11f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 11f\n" + "ldr s10, [x28, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr d9, [x28], #0x8\n" + "mov x19, #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "11:" // Height 1: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 13f\n" + "12:" // Height 1: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "13:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x27, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x26, #0x4\n" + "blt 21f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x8\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q6, [x10, #0x20]\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr 
q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 24f\n" + "cbz x26, 24f\n" + "tbz x26, #1, 22f\n" + "ldr d0, [x25], #0x8\n" + "tbz x26, #0, 23f\n" + "ld1 { v0.s }[2], [x25]\n" + "b 23f\n" + "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "24:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 16b\n" + "uzp1 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "tbz %x[flags], #1, 25f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "25:" // Height 1: No activation + "cmp x11, #0x10\n" + "bge 34f\n" + "tbz x11, #3, 29f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 27f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 26f\n" + "str d11, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v11.s }[2], [x28]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz 
x11, #0, 33f\n" + "str s11, [x28, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 28f\n" + "str d10, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v10.s }[2], [x28]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 33f\n" + "str s10, [x28, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 31f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 30f\n" + "str d9, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v9.s }[2], [x28]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 33f\n" + "str s9, [x28, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 32f\n" + "str d8, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v8.s }[2], [x28]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "35:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "37:" // Height 2: Column loop + "cbz x9, 38f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "add x9, x9, #0x40\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 50f\n" + "38:" // Height 2: no bias + "tbz %x[flags], #0, 49f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "bge 47f\n" + "tbz x11, #3, 42f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 40f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "tbz x11, #0, 46f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 46f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 41f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "tbz x11, #0, 46f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 46f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 44f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "tbz x11, #0, 46f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 46f\n" + "ldr s10, 
[x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 45f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "tbz x11, #0, 46f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 48f\n" + "47:" // Height 2: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "48:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 50f\n" + "49:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "50:" // Height 2: setup done + "mov x27, #0x0\n" + "51:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 52f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 53f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 53f\n" + "52:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "53:" // Height 2: input setup done + "cmp x26, #0x4\n" + "blt 56f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 55f\n" + "54:" // Height 2: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "bge 54b\n" + "55:" // Height 2: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + "prfm 
pldl1keep, [x25, #0x80]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "56:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 59f\n" + "cbz x26, 59f\n" + "tbz x26, #1, 57f\n" + "ldr d0, [x25], #0x8\n" + "ldr d1, [x24], #0x8\n" + "tbz x26, #0, 58f\n" + "ld1 { v0.s }[2], [x25]\n" + "ld1 { v1.s }[2], [x24]\n" + "b 58f\n" + "57:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "ldr s1, [x24, #0x0]\n" + "58:" // Height 2: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "59:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 51b\n" + "uzp1 v6.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "tbz %x[flags], #1, 60f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v6.4s, v6.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "60:" // Height 2: No activation + "cmp x11, #0x10\n" + "bge 69f\n" + "tbz x11, #3, 64f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "tbz x11, #2, 62f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "tbz x11, #1, 61f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "b 68f\n" + "61:" // Height 2: Partial direct writeback: partial_1_12 
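+      // Note: on this partial_1_12 path 12 columns are already stored; bit #0
+      // of x11 (columns remaining) decides whether a 13th single-float store
+      // follows. The whole writeback tree peels power-of-two chunks this way:
+      // bit #3 = 8 floats, #2 = 4, #1 = 2, #0 = 1, covering any 1-15 tail.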
+ "tbz x11, #0, 68f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 63f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 68f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 66f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "tbz x11, #1, 65f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 68f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 67f\n" + "str d6, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v6.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s6, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done + "b 70f\n" + "69:" // Height 2: Full writeback + "str q6, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "70:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 37b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "72:" // Height 3: Column loop + "cbz x9, 73f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 85f\n" + "73:" // Height 3: no bias + "tbz %x[flags], #0, 84f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 82f\n" + "tbz x11, #3, 77f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #2, 75f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "tbz x11, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 81f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "b 
81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 76f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 81f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 79f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #1, 78f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 81f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 80f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 83f\n" + "82:" // Height 3: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "83:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 85f\n" + "84:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "85:" // Height 3: setup done + "mov x27, #0x0\n" + "86:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 87f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 88f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add 
x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 88f\n" + "87:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "88:" // Height 3: input setup done + "cmp x26, #0x4\n" + "blt 91f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 90f\n" + "89:" // Height 3: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "cmp x26, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 89b\n" + "90:" // Height 3: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "91:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 94f\n" + "cbz x26, 94f\n" + "tbz x26, #1, 92f\n" + "ldr d0, 
[x25], #0x8\n" + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "tbz x26, #0, 93f\n" + "ld1 { v0.s }[2], [x25]\n" + "ld1 { v1.s }[2], [x24]\n" + "ld1 { v2.s }[2], [x23]\n" + "b 93f\n" + "92:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "ldr s1, [x24, #0x0]\n" + "ldr s2, [x23, #0x0]\n" + "93:" // Height 3: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "94:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 86b\n" + "uzp1 v6.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "tbz %x[flags], #1, 95f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v6.4s, v6.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "95:" // Height 3: No activation + "cmp x11, #0x10\n" + "bge 104f\n" + "tbz x11, #3, 99f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { 
v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 97f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 96f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 103f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 98f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "b 103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 103f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 101f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 100f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 103f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 102f\n" + "str d6, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v6.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s6, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q6, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "105:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 72b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "107:" // Height 4: Column loop + "cbz x9, 108f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 120f\n" + "108:" // 
Height 4: no bias + "tbz %x[flags], #0, 119f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 117f\n" + "tbz x11, #3, 112f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 110f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 109f\n" + "mov x19, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 116f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "b 116f\n" + "110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 111f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 116f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 114f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 113f\n" + "mov x19, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 116f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "b 116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 115f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 118f\n" + "117:" // Height 4: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, 
#0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "118:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 120f\n" + "119:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "120:" // Height 4: setup done + "mov x27, #0x0\n" + "121:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 122f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 123f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 123f\n" + "122:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "123:" // Height 4: input setup done + "cmp x26, #0x4\n" + "blt 126f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 125f\n" + "124:" // Height 4: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "cmp x26, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v3.4s }, [x22], #0x10\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 
0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 124b\n" + "125:" // Height 4: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "ld1 { v3.4s }, [x22], #0x10\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q6, [x10, #0x20]\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "126:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 129f\n" + "cbz x26, 129f\n" + "tbz x26, #1, 127f\n" + "ldr d0, [x25], #0x8\n" + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "tbz x26, #0, 128f\n" + "ld1 { v0.s }[2], [x25]\n" + "ld1 { v1.s }[2], [x24]\n" + "ld1 { v2.s }[2], [x23]\n" + "ld1 { v3.s }[2], [x22]\n" + "b 128f\n" + "127:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "ldr s1, [x24, #0x0]\n" + "ldr s2, [x23, #0x0]\n" + "ldr s3, [x22, #0x0]\n" + "128:" // Height 4: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" 
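+      // Note: final bfmmla group of the ragged-K tail. Each bfmmla multiplies a
+      // 2x4 bf16 block of A (v0 = rows 0-1, v2 = rows 2-3) by a 4x2 bf16 block
+      // of B (v6/v7), accumulating a 2x2 fp32 tile; row pairs stay interleaved
+      // until the uzp1/uzp2 sequence after the string loop completes.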
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "129:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 121b\n" + "uzp1 v6.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "tbz %x[flags], #1, 130f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v6.4s, v6.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "130:" // Height 4: No activation + "cmp x11, #0x10\n" + "bge 139f\n" + "tbz x11, #3, 134f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x11, #2, 132f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x11, #1, 131f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 138f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 133f\n" + "str d13, 
[x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 138f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 136f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x11, #1, 135f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 138f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 137f\n" + "str d6, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v6.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s6, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q6, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "140:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 107b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "142:" // Height 5: Column loop + "cbz x9, 143f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "mov v24.16b, v8.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v15.16b\n" + "b 155f\n" + "143:" // Height 5: no bias + "tbz %x[flags], #0, 154f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, 
x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 152f\n" + "tbz x11, #3, 147f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #2, 145f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "tbz x11, #1, 144f\n" + "ldr d16, [x28], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d6, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v6.s }[2], [x21]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 151f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s6, [x21, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 146f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 151f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 149f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #1, 148f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 151f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 150f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 153f\n" 
+ "152:" // Height 5: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q25, [x21, #0x0]\n" + "ldr q26, [x21, #0x10]\n" + "ldr q27, [x21, #0x20]\n" + "ldr q6, [x21, #0x30]\n" + "153:" // Height 5: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 155f\n" + "154:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "155:" // Height 5: setup done + "mov x27, #0x0\n" + "156:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 157f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 158f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 158f\n" + "157:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "158:" // Height 5: input setup done + "cmp x26, #0x4\n" + "blt 161f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 160f\n" + "159:" // Height 5: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "cmp x26, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v3.4s }, [x22], #0x10\n" + "ld1 { v4.4s }, [x21], #0x10\n" + ".inst 0x4ea16862 // 
bfcvtn2 v2.8h, v3.4s\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + "ldr q7, [x10, #0x10]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 159b\n" + "160:" // Height 5: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "ld1 { v3.4s }, [x22], #0x10\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v4.4s }, [x21], #0x10\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ldr q7, [x10, #0x10]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, 
v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "161:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 164f\n" + "cbz x26, 164f\n" + "tbz x26, #1, 162f\n" + "ldr d0, [x25], #0x8\n" + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "ldr d4, [x21], #0x8\n" + "tbz x26, #0, 163f\n" + "ld1 { v0.s }[2], [x25]\n" + "ld1 { v1.s }[2], [x24]\n" + "ld1 { v2.s }[2], [x23]\n" + "ld1 { v3.s }[2], [x22]\n" + "ld1 { v4.s }[2], [x21]\n" + "b 163f\n" + "162:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "ldr s1, [x24, #0x0]\n" + "ldr s2, [x23, #0x0]\n" + "ldr s3, [x22, #0x0]\n" + "ldr s4, [x21, #0x0]\n" + "163:" // Height 5: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "164:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 156b\n" + "uzp1 v6.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + 
"add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "tbz %x[flags], #1, 165f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v6.4s, v6.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "165:" // Height 5: No activation + "cmp x11, #0x10\n" + "bge 174f\n" + "tbz x11, #3, 169f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 167f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 166f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 173f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 168f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { 
v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 173f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "b 173f\n" + "169:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 171f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 170f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 173f\n" + "170:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 173f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 172f\n" + "str d6, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v6.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s6, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q6, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "175:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 142b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "177:" // Height 6: Column loop + "cbz x9, 178f\n" + "ldr q8, [x9, #0x0]\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "ldr q9, [x9, #0x10]\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "ldr q10, [x9, #0x20]\n" + "mov v16.16b, v8.16b\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "mov v20.16b, v12.16b\n" + "mov v24.16b, v8.16b\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, 
v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v15.16b\n" + "b 190f\n" + "178:" // Height 6: no bias + "tbz %x[flags], #0, 189f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 187f\n" + "tbz x11, #3, 182f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 180f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v19.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v27.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 179f\n" + "ldr d16, [x28], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d6, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v6.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x11, #0, 186f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s6, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 181f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "mov x19, #0x28\n" + "ldr d19, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x11, #0, 186f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 184f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 183f\n" + "ldr d10, [x28], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x11, #0, 186f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "ldr s29, 
[x20, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 185f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "mov x19, #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + "186:" // Height 6: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 188f\n" + "187:" // Height 6: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q17, [x23, #0x0]\n" + "ldr q18, [x23, #0x10]\n" + "ldr q19, [x23, #0x20]\n" + "ldr q24, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q25, [x21, #0x0]\n" + "ldr q26, [x21, #0x10]\n" + "ldr q27, [x21, #0x20]\n" + "ldr q6, [x21, #0x30]\n" + "ldr q28, [x20, #0x0]\n" + "ldr q29, [x20, #0x10]\n" + "ldr q30, [x20, #0x20]\n" + "ldr q31, [x20, #0x30]\n" + "188:" // Height 6: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 190f\n" + "189:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "190:" // Height 6: setup done + "mov x27, #0x0\n" + "191:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 192f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 193f\n" + "ldr 
x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 193f\n" + "192:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "193:" // Height 6: input setup done + "cmp x26, #0x4\n" + "blt 196f\n" + "ld1 { v0.4s }, [x25], #0x10\n" + "cmp x26, #0x8\n" + "blt 195f\n" + "194:" // Height 6: Multiply loop: Main loop head + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "cmp x26, #0x8\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v3.4s }, [x22], #0x10\n" + "ld1 { v4.4s }, [x21], #0x10\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ld1 { v5.4s }, [x20], #0x10\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "ld1 { v0.4s }, [x25], #0x10\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 194b\n" + "195:" // Height 6: Multiply loop: Single iteration only + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ld1 { v1.4s }, [x24], #0x10\n" + "sub x26, x26, #0x4\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + "ld1 { v2.4s }, [x23], #0x10\n" + "ld1 { v3.4s }, [x22], #0x10\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ld1 { v4.4s }, [x21], #0x10\n" + "ld1 { v5.4s }, [x20], #0x10\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 
0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "196:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 199f\n" + "cbz x26, 199f\n" + "tbz x26, #1, 197f\n" + "ldr d0, [x25], #0x8\n" + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "ldr d4, [x21], #0x8\n" + "ldr d5, [x20], #0x8\n" + "tbz x26, #0, 198f\n" + "ld1 { v0.s }[2], [x25]\n" + "ld1 { v1.s }[2], [x24]\n" + "ld1 { v2.s }[2], [x23]\n" + "ld1 { v3.s }[2], [x22]\n" + "ld1 { v4.s }[2], [x21]\n" + "ld1 { v5.s }[2], [x20]\n" + "b 198f\n" + "197:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr s0, [x25, #0x0]\n" + "ldr s1, [x24, #0x0]\n" + "ldr s2, [x23, #0x0]\n" + "ldr s3, [x22, #0x0]\n" + "ldr s4, [x21, #0x0]\n" + "ldr s5, [x20, #0x0]\n" + "198:" // Height 6: Multiply loop: Ragged operand read: Done + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" + ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" + ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + 
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "199:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 191b\n" + "uzp1 v6.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "tbz %x[flags], #1, 200f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v6.4s, v6.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, 
v30.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "200:" // Height 6: No activation + "cmp x11, #0x10\n" + "bge 209f\n" + "tbz x11, #3, 204f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v9.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x11, #2, 202f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x24], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x11, #1, 201f\n" + "str d14, [x28], #0x8\n" + "str d11, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 208f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 203f\n" + "str d13, [x28], #0x8\n" + "str d10, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v29.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 208f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s29, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 206f\n" + "st1 { v6.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x24], #0x10\n" + "st1 { v15.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v23.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x11, #1, 205f\n" + "str d12, [x28], #0x8\n" + "str d9, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v28.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 208f\n" + "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 208f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s28, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 207f\n" + "str d6, [x28], #0x8\n" + "str d8, [x24], #0x8\n" + "str d15, [x23], 
#0x8\n" + "str d16, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v6.s }[2], [x28]\n" + "st1 { v8.s }[2], [x24]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 208f\n" + "207:" // Height 6: Partial direct writeback: partial_1_0 + "str s6, [x28, #0x0]\n" + "str s8, [x24, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q6, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x24, #0x0]\n" + "str q9, [x24, #0x10]\n" + "str q10, [x24, #0x20]\n" + "str q11, [x24, #0x30]\n" + "str q15, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q22, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q23, [x21, #0x0]\n" + "str q28, [x21, #0x10]\n" + "str q29, [x21, #0x20]\n" + "str q30, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "210:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 177b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 212f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 211f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "211:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "212:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp index caef6396be..94f5783686 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. 
 */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef __aarch64__
 
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_s8qa_dot_4x16_a55( ARGLIST );
 class cls_a64_hybrid_s8qa_dot_4x16
 {
 public:
-    typedef int8_t operand_type;
+    typedef int8_t lhs_operand_type;
+    typedef int8_t rhs_operand_type;
     typedef int8_t result_type;
 
     typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
         return false;
     }
 
-    StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
-
-    static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-        switch (ci->get_cpu_model()) {
-            case CPUModel::A55r1:
-                return { 7.5301 };
-            default:
-                return { 27.5482 };
+        if (std::is_same<T, int8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A55r1:
+                    return { 7.5301 };
+                default:
+                    return { 27.5482 };
+                case CPUModel::A510:
+                    return { 14.81 };
+                case CPUModel::V1:
+                    return { 48.34 };
+            }
         }
+
+        return { 1.0 };
     }
 
     // Default to the generic kernel
@@ -99,4 +108,5 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
+
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index 11aa05a9b7..ee7e55f179 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -406,10 +406,10 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
       "b 122f\n"
       "31:" // Height 2
       "movi v11.4s, #0x0\n"
-      "movi v12.4s, #0x0\n"
-      "movi v15.16b, #0x1\n"
       "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+      "movi v12.4s, #0x0\n"
       "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "movi v15.16b, #0x1\n"
       "mov x9, %x[col_bias]\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "mov x28, %x[output_ptr]\n"
@@ -853,12 +853,12 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
       "b 122f\n"
       "61:" // Height 3
       "movi v11.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v12.4s, #0x0\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "movi v13.4s, #0x0\n"
-      "movi v15.16b, #0x1\n"
-      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
       "mov x9, %x[col_bias]\n"
-      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "movi v15.16b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "mov x28, %x[output_ptr]\n"
       "62:" // Height 3: Column loop
@@ -1426,14 +1426,14 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
       "b 122f\n"
       "91:" // Height 4
       "movi v11.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v12.4s, #0x0\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "movi v13.4s, #0x0\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "movi v14.4s, #0x0\n"
-      "movi v15.16b, #0x1\n"
-      "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
-      "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "mov x9, %x[col_bias]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v15.16b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "mov x28, %x[output_ptr]\n"
       "mov x19, #0x4\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 0adfb99f23..a1c4b34d38 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -283,16 +283,16 @@ void a64_hybrid_s8qa_dot_4x16 (
       "sqrdmulh v19.4s, v19.4s, v4.4s\n"
"tbz %x[flags], #5, 20f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" @@ -612,8 +612,8 @@ void a64_hybrid_s8qa_dot_4x16 ( "ld1r { v2.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "addp v12.4s, v12.4s, v12.4s\n" "neg v2.4s, v2.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" "mul v11.4s, v11.4s, v2.4s\n" "mul v12.4s, v12.4s, v2.4s\n" "49:" // Height 2: skip row sum fixup @@ -653,27 +653,27 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v23.4s, v23.4s, v4.4s\n" "tbz %x[flags], #5, 50f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" "50:" // Height 2: no shift correction @@ -690,8 +690,6 @@ void a64_hybrid_s8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -710,16 +708,18 @@ void a64_hybrid_s8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v20.16b, v20.16b, v21.16b\n" "bge 59f\n" @@ -1094,9 +1094,9 @@ void a64_hybrid_s8qa_dot_4x16 ( "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v11.4s, v11.4s, v11.4s\n" + "neg v3.4s, v3.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" "mul v11.4s, v11.4s, v3.4s\n" "mul v12.4s, v12.4s, v3.4s\n" "mul v13.4s, v13.4s, v3.4s\n" @@ -1149,39 +1149,39 @@ void 
a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v27.4s, v27.4s, v4.4s\n" "tbz %x[flags], #5, 80f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" "and v7.16b, v26.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "and v8.16b, v27.16b, v0.16b\n" - "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" "sqadd v26.4s, v26.4s, v7.4s\n" "sqadd v27.4s, v27.4s, v8.4s\n" "80:" // Height 3: no shift correction @@ -1198,8 +1198,6 @@ void a64_hybrid_s8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -1218,31 +1216,33 @@ void a64_hybrid_s8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "add v24.4s, v24.4s, v4.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" "add v25.4s, v25.4s, v4.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" "uzp1 v17.8h, v18.8h, v19.8h\n" + "smin v27.4s, v27.4s, v6.4s\n" "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v20.8h, v20.8h, v21.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" "uzp1 v25.8h, v26.8h, v27.8h\n" @@ -1705,10 +1705,10 @@ void 
a64_hybrid_s8qa_dot_4x16 ( "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" "addp v11.4s, v11.4s, v11.4s\n" + "neg v4.4s, v4.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v4.4s, v4.4s\n" "mul v11.4s, v11.4s, v4.4s\n" "mul v12.4s, v12.4s, v4.4s\n" "mul v13.4s, v13.4s, v4.4s\n" @@ -1774,52 +1774,52 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v31.4s, v31.4s, v4.4s\n" "tbz %x[flags], #5, 110f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" "and v7.16b, v26.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v8.4s, v8.4s, #0x1f\n" "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "and v9.16b, v28.16b, v0.16b\n" "and v10.16b, v29.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" "and v4.16b, v30.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v28.4s, v28.4s, v9.4s\n" "sqadd v29.4s, v29.4s, v10.4s\n" "sqadd v30.4s, v30.4s, v4.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v31.4s, v31.4s, v5.4s\n" "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" @@ -1835,8 +1835,6 @@ void a64_hybrid_s8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -1855,45 +1853,47 @@ void a64_hybrid_s8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v0.4s\n" + "smin v24.4s, 
v24.4s, v6.4s\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "add v24.4s, v24.4s, v4.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" "add v25.4s, v25.4s, v4.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "srshl v28.4s, v28.4s, v0.4s\n" "smin v26.4s, v26.4s, v6.4s\n" + "add v28.4s, v28.4s, v4.4s\n" "smin v27.4s, v27.4s, v6.4s\n" - "srshl v29.4s, v29.4s, v0.4s\n" "smax v26.4s, v26.4s, v5.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" "smax v27.4s, v27.4s, v5.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" "add v30.4s, v30.4s, v4.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" "add v31.4s, v31.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" "smin v30.4s, v30.4s, v6.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" "smin v31.4s, v31.4s, v6.4s\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" "smax v30.4s, v30.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v20.8h, v20.8h, v21.8h\n" "uzp1 v21.8h, v22.8h, v23.8h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp new file mode 100644 index 0000000000..bc933afd9a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int8_t>, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hybrid_s8qa_mmla_4x16( ARGLIST ); + +class cls_a64_hybrid_s8qa_mmla_4x16 +{ +public: + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 4, 16, 8> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 47.74 }; + case CPUModel::A510: + return { 27.99 }; + case CPUModel::V1: + return { 68.76 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8qa_mmla_4x16; + cls_a64_hybrid_s8qa_mmla_4x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp new file mode 100644 index 0000000000..4bc807cd8e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp @@ -0,0 +1,2104 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_s8qa_mmla_4x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 97f\n" + "cmp %x[M], #0x2\n" + "bgt 65f\n" + "beq 33f\n" + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "movi v15.16b, #0x1\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "blt 11f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q5, [x28, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q6, [x28, #0x10]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s,
v0.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "ldr q1, [x23, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q5, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "sub x24, x24, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x23, x23, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 10f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 20f\n" + "cmp x24, #0x8\n" + "blt 14f\n" + "12:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x30]\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + 
"ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + "bge 12b\n" + "cbz x24, 20f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x24, #2, 16f\n" + "ldr s1, [x23], #0x4\n" + "tbz x24, #1, 15f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[6], [x23]\n" + "b 18f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[4], [x23]\n" + "b 18f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 17f\n" + "ldr h1, [x23], #0x2\n" + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[2], [x23]\n" + "b 18f\n" + "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "18:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 19f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "19:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x40]\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x60]\n" + "ldr q10, [x28, #0x70]\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + "20:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v16.16b\n" + "tbnz %x[flags], #31, 21f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v1.4s }, [x22]\n" + "dup v11.4s, v11.s[0]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "21:" // Height 1: skip row sum fixup + "add v23.4s, v23.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v23.4s, v23.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 22f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, 
v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "22:" // Height 1: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v23.8h, v23.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v17.16b\n" + "bge 31f\n" + "tbz x9, #3, 26f\n" + "str d23, [x26], #0x8\n" + "tbz x9, #2, 24f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "tbz x9, #1, 23f\n" + "st1 { v23.h }[6], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[14], [x26]\n" + "b 30f\n" + "23:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[12], [x26]\n" + "b 30f\n" + "24:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 25f\n" + "st1 { v23.h }[4], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[10], [x26]\n" + "b 30f\n" + "25:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[8], [x26]\n" + "b 30f\n" + "26:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 28f\n" + "str s23, [x26], #0x4\n" + "tbz x9, #1, 27f\n" + "st1 { v23.h }[2], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[6], [x26]\n" + "b 30f\n" + "27:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[4], [x26]\n" + "b 30f\n" + "28:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 29f\n" + "str h23, [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[2], [x26]\n" + "b 30f\n" + "29:" // Height 1: Partial direct writeback: partial_1_0 + "str b23, [x26, #0x0]\n" + "30:" // Height 1: Partial direct writeback: Done + "b 32f\n" + "31:" // Height 1: Full writeback + "str q23, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "32:" // Height 1: Writeback done + "subs x9, x9, #0x10\n" + "bgt 2b\n" + "b 130f\n" + "33:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v15.16b, #0x1\n" + "mov x26, %x[output_ptr]\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x25, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 38f\n" + "37:" // Height 2: setup 
direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "38:" // Height 2: input setup done + "cmp x24, #0x10\n" + "blt 43f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q5, [x28, #0x70]\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + "ldr q7, [x28, #0x90]\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q10, [x28, #0xc0]\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q5, [x28, #0x70]\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + "ldr q8, [x28, #0xa0]\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q10, [x28, #0xc0]\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 
0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 52f\n" + "cmp x24, #0x8\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x30]\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + "bge 44b\n" + "cbz x24, 52f\n" + "46:" // Height 2: Multiply loop: Skip odd blocks + "tbz x24, #2, 48f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbz x24, #1, 47f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "b 50f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "b 50f\n" + "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 49f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 50f\n" + "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "50:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 51f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "51:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x40]\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x60]\n" + "ldr q10, [x28, #0x70]\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + "52:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 36b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, 
v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v4.16b\n" + "tbnz %x[flags], #31, 53f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "53:" // Height 2: skip row sum fixup + "add v23.4s, v23.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 54f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "54:" // Height 2: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax 
v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "uzp1 v23.8h, v23.8h, v20.8h\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 63f\n" + "tbz x9, #3, 58f\n" + "str d23, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "tbz x9, #2, 56f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "tbz x9, #1, 55f\n" + "st1 { v23.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "b 62f\n" + "55:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "b 62f\n" + "56:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 57f\n" + "st1 { v23.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "b 62f\n" + "57:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "b 62f\n" + "58:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 60f\n" + "str s23, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "tbz x9, #1, 59f\n" + "st1 { v23.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "b 62f\n" + "59:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "b 62f\n" + "60:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 61f\n" + "str h23, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "b 62f\n" + "61:" // Height 2: Partial direct writeback: partial_1_0 + "str b23, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "62:" // Height 2: Partial direct writeback: Done + "b 64f\n" + "63:" // Height 2: Full writeback + "str q23, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "64:" // Height 2: Writeback done + "subs x9, x9, #0x10\n" + "bgt 34b\n" + "b 130f\n" + "65:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "mov x26, %x[output_ptr]\n" + "movi v15.16b, #0x1\n" + "66:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "67:" // Height 3: setup done + "mov x25, #0x0\n" + "68:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz 
%x[flags], #3, 69f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 70f\n" + "69:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "70:" // Height 3: input setup done + "cmp x24, #0x10\n" + "blt 75f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 73f\n" + "71:" // Height 3: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x20]\n" + "add x21, x21, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 72f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" + "72:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" 
+ "prfm pldl1keep, [x21, #0x80]\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 71b\n" + "73:" // Height 3: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + "add x21, x21, #0x10\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 74f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" + "74:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "75:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 84f\n" + "cmp x24, #0x8\n" + "blt 78f\n" + "76:" // Height 3: Multiply loop: Odd block loop + "movi v7.16b, #0x0\n" + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x21], #0x8\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 77f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "77:" // Height 3: Multiply loop: 
unique 11: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + "bge 76b\n" + "cbz x24, 84f\n" + "78:" // Height 3: Multiply loop: Skip odd blocks + "tbz x24, #2, 80f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x21], #0x4\n" + "tbz x24, #1, 79f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "ld1 { v3.h }[2], [x21], #0x2\n" + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "ld1 { v3.b }[6], [x21]\n" + "b 82f\n" + "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "ld1 { v3.b }[4], [x21]\n" + "b 82f\n" + "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 81f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x21], #0x2\n" + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x21]\n" + "b 82f\n" + "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x21, #0x0]\n" + "82:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v9.16b, #0x0\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 83f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "83:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + "ldr q9, [x28, #0x60]\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" + "ldr q10, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, 
v0.16b, v9.16b\n" + ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + "84:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 68b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "add x20, x21, x19\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 85f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x22]\n" + "addp v13.4s, v13.4s, v13.4s\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v3.4s, v3.4s\n" + "dup v13.4s, v13.s[0]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "85:" // Height 3: skip row sum fixup + "add v31.4s, v31.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 86f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, 
v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "86:" // Height 3: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "uzp1 v31.8h, v31.8h, v20.8h\n" + "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 95f\n" + "tbz x9, #3, 90f\n" + "str d31, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x9, #2, 88f\n" + "st1 { v31.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x9, #1, 87f\n" + "st1 { v31.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 94f\n" + "87:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 94f\n" + "88:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 89f\n" + "st1 { v31.h }[4], [x26], 
#0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 94f\n" + "89:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 94f\n" + "90:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x9, #2, 92f\n" + "str s31, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x9, #1, 91f\n" + "st1 { v31.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 94f\n" + "91:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 94f\n" + "92:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 93f\n" + "str h31, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 94f\n" + "93:" // Height 3: Partial direct writeback: partial_1_0 + "str b31, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "94:" // Height 3: Partial direct writeback: Done + "b 96f\n" + "95:" // Height 3: Full writeback + "str q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "96:" // Height 3: Writeback done + "subs x9, x9, #0x10\n" + "bgt 66b\n" + "b 130f\n" + "97:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" + "movi v14.4s, #0x0\n" + "mov x19, #0x4\n" + "movi v15.16b, #0x1\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "98:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "99:" // Height 4: setup done + "mov x25, #0x0\n" + "100:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 101f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 102f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 102f\n" + "101:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "102:" // Height 4: input setup done + "cmp x24, #0x10\n" + "blt 107f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 105f\n" + "103:" // 
Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 103b\n" + "105:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x4e85a410 // smmla v16.4s, v0.16b, 
v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 106f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" + "106:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "107:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 116f\n" + "cmp x24, #0x8\n" + "blt 110f\n" + "108:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 109f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "109:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x4e8aa411 
// smmla v17.4s, v0.16b, v10.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + "bge 108b\n" + "cbz x24, 116f\n" + "110:" // Height 4: Multiply loop: Skip odd blocks + "tbz x24, #2, 112f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x21], #0x4\n" + "ldr s9, [x20], #0x4\n" + "tbz x24, #1, 111f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "ld1 { v3.h }[2], [x21], #0x2\n" + "ld1 { v9.h }[2], [x20], #0x2\n" + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "ld1 { v3.b }[6], [x21]\n" + "ld1 { v9.b }[6], [x20]\n" + "b 114f\n" + "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "ld1 { v3.b }[4], [x21]\n" + "ld1 { v9.b }[4], [x20]\n" + "b 114f\n" + "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 113f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x21], #0x2\n" + "ldr h9, [x20], #0x2\n" + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x21]\n" + "ld1 { v9.b }[2], [x20]\n" + "b 114f\n" + "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x21, #0x0]\n" + "ldr b9, [x20, #0x0]\n" + "114:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 115f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "115:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + "ldr q9, [x28, #0x60]\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" + "ldr q10, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + "116:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, 
[%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 100b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "add x20, x21, x19\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x19, #0x0]\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 117f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "addp v13.4s, v13.4s, v13.4s\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v4.4s, v4.4s\n" + "dup v14.4s, v13.s[3]\n" + "dup v13.4s, v13.s[0]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "117:" // Height 4: skip row sum fixup + "add v31.4s, v31.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v23.4s, v23.4s, v13.4s\n" + "add v28.4s, v28.4s, v13.4s\n" + "add v29.4s, v29.4s, v13.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "add v24.4s, v24.4s, v14.4s\n" + "add v25.4s, v25.4s, v14.4s\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v27.4s, v27.4s, v14.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 118f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr 
v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "and v5.16b, v23.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sqadd v23.4s, v23.4s, v5.4s\n" + "and v6.16b, v28.16b, v0.16b\n" + "and v7.16b, v29.16b, v0.16b\n" + "and v8.16b, v30.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v6.4s\n" + "sqadd v29.4s, v29.4s, v7.4s\n" + "sqadd v30.4s, v30.4s, v8.4s\n" + "and v9.16b, v24.16b, v0.16b\n" + "and v10.16b, v25.16b, v0.16b\n" + "and v4.16b, v26.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v10.4s\n" + "sqadd v26.4s, v26.4s, v4.4s\n" + "and v5.16b, v27.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "118:" // Height 4: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "uzp1 v31.8h, v31.8h, v20.8h\n" + 
"add v26.4s, v26.4s, v4.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.8h, v23.8h, v28.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 127f\n" + "tbz x9, #3, 122f\n" + "str d31, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "str d24, [x19], #0x8\n" + "tbz x9, #2, 120f\n" + "st1 { v31.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v23.s }[2], [x20], #0x4\n" + "st1 { v24.s }[2], [x19], #0x4\n" + "tbz x9, #1, 119f\n" + "st1 { v31.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v23.h }[6], [x20], #0x2\n" + "st1 { v24.h }[6], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v23.b }[14], [x20]\n" + "st1 { v24.b }[14], [x19]\n" + "b 126f\n" + "119:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v23.b }[12], [x20]\n" + "st1 { v24.b }[12], [x19]\n" + "b 126f\n" + "120:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 121f\n" + "st1 { v31.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v23.h }[4], [x20], #0x2\n" + "st1 { v24.h }[4], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v23.b }[10], [x20]\n" + "st1 { v24.b }[10], [x19]\n" + "b 126f\n" + "121:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v23.b }[8], [x20]\n" + "st1 { v24.b }[8], [x19]\n" + "b 126f\n" + "122:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 124f\n" + "str s31, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "str s23, [x20], #0x4\n" + "str s24, [x19], #0x4\n" + "tbz x9, #1, 123f\n" + "st1 { v31.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v23.h }[2], [x20], #0x2\n" + "st1 { v24.h }[2], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v23.b }[6], [x20]\n" + "st1 { v24.b }[6], [x19]\n" + "b 126f\n" + "123:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v23.b }[4], [x20]\n" + "st1 { v24.b }[4], [x19]\n" + "b 126f\n" + "124:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x9, #1, 125f\n" + "str h31, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "str h23, [x20], #0x2\n" + "str h24, [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v23.b }[2], [x20]\n" + "st1 { v24.b }[2], [x19]\n" + "b 126f\n" + "125:" // Height 4: Partial direct writeback: partial_1_0 + "str b31, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b23, [x20, #0x0]\n" + "str b24, [x19, #0x0]\n" + "126:" // Height 4: Partial direct writeback: Done + "b 128f\n" + "127:" // Height 4: Full writeback + "str q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "str q23, 
[x20, #0x0]\n"
+      "str q24, [x19, #0x0]\n"
+      "128:" // Height 4: Writeback done
+      "subs x9, x9, #0x10\n"
+      "bgt 98b\n"
+      "subs %x[M], %x[M], #0x4\n"
+      "beq 130f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 129f\n"
+      "add x20, x20, #0x4\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "129:" // Update direct input
+      "mov x19, #0x4\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "130:" // Exit
+
+    : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+    : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
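The MMLA kernels above are built around the SMMLA instruction (emitted here as raw ".inst" words with the decoding in the comment): each "smmla vd.4s, vn.16b, vm.16b" multiplies the 2x8 matrix of int8 values in vn by an 8x2 int8 matrix held transposed in vm (two rows of eight contiguous bytes), accumulating a 2x2 int32 tile into vd. That layout is why the kernel interleaves pairs of input rows with trn1/trn2 before the main loop and de-interleaves the accumulators with uzp1/uzp2 before writeback. A minimal scalar model of one SMMLA, for reference only:

    // d (2x2 int32 accumulator) += a (2x8 int8) * b^T, with b supplied as 2x8.
    // Register mapping: a = Vn.16B, b = Vm.16B, d = Vd.4S.
    #include <cstdint>

    static void smmla_ref(int32_t d[2][2], const int8_t a[2][8], const int8_t b[2][8])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                for (int k = 0; k < 8; k++) {
                    d[i][j] += (int32_t)a[i][k] * (int32_t)b[j][k];
                }
            }
        }
    }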
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index eb5bdfe55c..b028a8a9a3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -22,8 +22,8 @@
  * IN THE SOFTWARE.
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef __aarch64__
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -44,7 +44,8 @@ void a64_hybrid_s8qs_dot_6x16_a55( ARGLIST );
 class cls_a64_hybrid_s8qs_dot_6x16
 {
 public:
-    typedef int8_t operand_type;
+    typedef int8_t lhs_operand_type;
+    typedef int8_t rhs_operand_type;
     typedef int8_t result_type;
 
     typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
         return false;
     }
 
-    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
-    static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-        switch (ci->get_cpu_model()) {
-            case CPUModel::A55r1:
-                return { 8.28 };
-            default:
-                return { 27.5482 };
+        if (std::is_same<T, int8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A55r1:
+                    return { 7.5301 };
+                case CPUModel::A510:
+                    return { 15.71 };
+                default:
+                    return { 27.5482 };
+                case CPUModel::V1:
+                    return { 52.09 };
+            }
         }
+
+        return { 1.0 };
     }
 
     // Default to the generic kernel
@@ -99,4 +108,5 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
+
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index 6e3a00ed72..ba8a2ccb1d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -309,8 +309,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
@@ -693,8 +693,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
@@ -1193,8 +1193,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
@@ -1809,8 +1809,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
@@ -2541,8 +2541,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
@@ -3392,8 +3392,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
       "ld1r { v0.4s }, [x25]\n"
       "ld1r { v4.4s }, [x24]\n"
       "mov v1.16b, v0.16b\n"
-      "mov v2.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
+      "mov v2.16b, v0.16b\n"
       "mov v6.16b, v4.16b\n"
       "mov v3.16b, v0.16b\n"
       "mov v7.16b, v4.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 5a4df161aa..f503f40b0c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ 
b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -287,16 +287,16 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v11.4s, v11.4s, v7.4s\n" "tbz %x[flags], #5, 17f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v9.4s, v9.4s, v5.4s\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" "17:" // Height 1: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" @@ -639,27 +639,27 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v15.4s, v15.4s, v7.4s\n" "tbz %x[flags], #5, 44f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "and v7.16b, v11.16b, v3.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "and v4.16b, v12.16b, v0.16b\n" "sqadd v9.4s, v9.4s, v5.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v12.16b, v0.16b\n" "and v5.16b, v13.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "and v6.16b, v14.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v12.4s, v12.4s, v4.4s\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v6.16b, v14.16b, v2.16b\n" "and v7.16b, v15.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v5.4s\n" "sqadd v14.4s, v14.4s, v6.4s\n" "sqadd v15.4s, v15.4s, v7.4s\n" "44:" // Height 2: no shift correction @@ -676,8 +676,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "srshl v14.4s, v14.4s, v2.4s\n" - "srshl v15.4s, v15.4s, v3.4s\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -696,16 +694,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax v11.4s, v11.4s, v5.4s\n" "smax v12.4s, v12.4s, v5.4s\n" "smax v13.4s, v13.4s, v5.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "add v14.4s, v14.4s, v4.4s\n" "add v15.4s, v15.4s, v4.4s\n" - "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" "smin v14.4s, v14.4s, v6.4s\n" "smin v15.4s, v15.4s, v6.4s\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" "smax v14.4s, v14.4s, v5.4s\n" "smax v15.4s, v15.4s, v5.4s\n" - "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v12.16b, v12.16b, v13.16b\n" "bge 53f\n" @@ -1105,37 +1105,37 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v19.4s, v19.4s, v7.4s\n" "tbz %x[flags], #5, 71f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "and v7.16b, v11.16b, v3.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "and v4.16b, v12.16b, v0.16b\n" "sqadd v9.4s, v9.4s, v5.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v12.16b, v0.16b\n" "and 
v5.16b, v13.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "and v6.16b, v14.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v12.4s, v12.4s, v4.4s\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v13.4s, v13.4s, v5.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" "and v4.16b, v16.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v14.4s, v14.4s, v6.4s\n" - "and v5.16b, v17.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v15.4s, v15.4s, v7.4s\n" - "and v6.16b, v18.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" "and v7.16b, v19.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" @@ -1154,8 +1154,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "srshl v14.4s, v14.4s, v2.4s\n" - "srshl v15.4s, v15.4s, v3.4s\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -1174,31 +1172,33 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax v11.4s, v11.4s, v5.4s\n" "smax v12.4s, v12.4s, v5.4s\n" "smax v13.4s, v13.4s, v5.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" "add v14.4s, v14.4s, v4.4s\n" "add v15.4s, v15.4s, v4.4s\n" - "srshl v16.4s, v16.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" "smin v14.4s, v14.4s, v6.4s\n" "smin v15.4s, v15.4s, v6.4s\n" - "srshl v17.4s, v17.4s, v1.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" "smax v14.4s, v14.4s, v5.4s\n" "smax v15.4s, v15.4s, v5.4s\n" - "add v16.4s, v16.4s, v4.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" "add v17.4s, v17.4s, v4.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" "add v18.4s, v18.4s, v4.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" "add v19.4s, v19.4s, v4.4s\n" - "uzp1 v8.8h, v8.8h, v9.8h\n" "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" "uzp1 v9.8h, v10.8h, v11.8h\n" + "smin v19.4s, v19.4s, v6.4s\n" "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v12.8h, v12.8h, v13.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" @@ -1685,52 +1685,52 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v23.4s, v23.4s, v7.4s\n" "tbz %x[flags], #5, 98f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "and v7.16b, v11.16b, v3.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "and v4.16b, v12.16b, v0.16b\n" "sqadd v9.4s, v9.4s, v5.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v12.16b, v0.16b\n" "and v5.16b, v13.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "and v6.16b, v14.16b, v2.16b\n" - 
"sshr v6.4s, v6.4s, #0x1f\n" "sqadd v12.4s, v12.4s, v4.4s\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v13.4s, v13.4s, v5.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" "and v4.16b, v16.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v14.4s, v14.4s, v6.4s\n" - "and v5.16b, v17.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v15.4s, v15.4s, v7.4s\n" - "and v6.16b, v18.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" "and v7.16b, v19.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v18.4s, v18.4s, v6.4s\n" - "and v5.16b, v21.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "and v5.16b, v21.16b, v1.16b\n" "and v6.16b, v22.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v20.4s, v20.4s, v4.4s\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v21.4s, v21.4s, v5.4s\n" "sqadd v22.4s, v22.4s, v6.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v23.4s, v23.4s, v7.4s\n" "98:" // Height 4: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" @@ -1746,8 +1746,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "srshl v14.4s, v14.4s, v2.4s\n" - "srshl v15.4s, v15.4s, v3.4s\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -1766,45 +1764,47 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax v11.4s, v11.4s, v5.4s\n" "smax v12.4s, v12.4s, v5.4s\n" "smax v13.4s, v13.4s, v5.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" "add v14.4s, v14.4s, v4.4s\n" "add v15.4s, v15.4s, v4.4s\n" - "srshl v16.4s, v16.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" "smin v14.4s, v14.4s, v6.4s\n" "smin v15.4s, v15.4s, v6.4s\n" - "srshl v17.4s, v17.4s, v1.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" "smax v14.4s, v14.4s, v5.4s\n" "smax v15.4s, v15.4s, v5.4s\n" - "add v16.4s, v16.4s, v4.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" "add v17.4s, v17.4s, v4.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" "add v18.4s, v18.4s, v4.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" "add v19.4s, v19.4s, v4.4s\n" - "srshl v20.4s, v20.4s, v0.4s\n" "smin v18.4s, v18.4s, v6.4s\n" + "add v20.4s, v20.4s, v4.4s\n" "smin v19.4s, v19.4s, v6.4s\n" - "srshl v21.4s, v21.4s, v1.4s\n" "smax v18.4s, v18.4s, v5.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" "smax v19.4s, v19.4s, v5.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, 
v6.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "uzp1 v8.8h, v8.8h, v9.8h\n" "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" "smax v22.4s, v22.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v12.8h, v12.8h, v13.8h\n" "uzp1 v13.8h, v14.8h, v15.8h\n" @@ -2379,63 +2379,63 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v27.4s, v27.4s, v7.4s\n" "tbz %x[flags], #5, 125f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "and v7.16b, v11.16b, v3.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "and v4.16b, v12.16b, v0.16b\n" "sqadd v9.4s, v9.4s, v5.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v12.16b, v0.16b\n" "and v5.16b, v13.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "and v6.16b, v14.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v12.4s, v12.4s, v4.4s\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v13.4s, v13.4s, v5.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" "and v4.16b, v16.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v14.4s, v14.4s, v6.4s\n" - "and v5.16b, v17.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v15.4s, v15.4s, v7.4s\n" - "and v6.16b, v18.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" "and v7.16b, v19.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v18.4s, v18.4s, v6.4s\n" - "and v5.16b, v21.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "and v5.16b, v21.16b, v1.16b\n" "and v6.16b, v22.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v20.4s, v20.4s, v4.4s\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v21.4s, v21.4s, v5.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v6.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "and v4.16b, v24.16b, v0.16b\n" "and v5.16b, v25.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v23.4s, v23.4s, v7.4s\n" - "and v6.16b, v26.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v4.4s\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "and v6.16b, v26.16b, v2.16b\n" "and v7.16b, v27.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v5.4s\n" "sqadd v26.4s, v26.4s, v6.4s\n" "sqadd v27.4s, v27.4s, v7.4s\n" "125:" // Height 5: no shift correction @@ -2452,8 +2452,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "srshl v14.4s, v14.4s, v2.4s\n" - "srshl v15.4s, v15.4s, v3.4s\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -2472,62 +2470,64 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax 
v11.4s, v11.4s, v5.4s\n" "smax v12.4s, v12.4s, v5.4s\n" "smax v13.4s, v13.4s, v5.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" "add v14.4s, v14.4s, v4.4s\n" "add v15.4s, v15.4s, v4.4s\n" - "srshl v16.4s, v16.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" "smin v14.4s, v14.4s, v6.4s\n" "smin v15.4s, v15.4s, v6.4s\n" - "srshl v17.4s, v17.4s, v1.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" "smax v14.4s, v14.4s, v5.4s\n" "smax v15.4s, v15.4s, v5.4s\n" - "add v16.4s, v16.4s, v4.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" "add v17.4s, v17.4s, v4.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" "add v18.4s, v18.4s, v4.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" "add v19.4s, v19.4s, v4.4s\n" - "srshl v20.4s, v20.4s, v0.4s\n" "smin v18.4s, v18.4s, v6.4s\n" + "add v20.4s, v20.4s, v4.4s\n" "smin v19.4s, v19.4s, v6.4s\n" - "srshl v21.4s, v21.4s, v1.4s\n" "smax v18.4s, v18.4s, v5.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" "smax v19.4s, v19.4s, v5.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v1.4s\n" "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "srshl v26.4s, v26.4s, v2.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "add v25.4s, v25.4s, v4.4s\n" "add v26.4s, v26.4s, v4.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v25.4s, v25.4s, v6.4s\n" "smin v26.4s, v26.4s, v6.4s\n" "smin v27.4s, v27.4s, v6.4s\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v25.4s, v25.4s, v5.4s\n" "smax v26.4s, v26.4s, v5.4s\n" "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" @@ -3190,73 +3190,73 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v31.4s, v31.4s, v7.4s\n" "tbz %x[flags], #5, 152f\n" "and v4.16b, v8.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" "and v6.16b, v10.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "and v7.16b, v11.16b, v3.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v8.4s, v8.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "and v4.16b, v12.16b, v0.16b\n" "sqadd v9.4s, v9.4s, v5.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v6.4s\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v12.16b, v0.16b\n" "and v5.16b, v13.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, 
v5.4s, #0x1f\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "and v6.16b, v14.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v12.4s, v12.4s, v4.4s\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v13.4s, v13.4s, v5.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" "and v4.16b, v16.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v14.4s, v14.4s, v6.4s\n" - "and v5.16b, v17.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v15.4s, v15.4s, v7.4s\n" - "and v6.16b, v18.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" "and v7.16b, v19.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v18.4s, v18.4s, v6.4s\n" - "and v5.16b, v21.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "and v5.16b, v21.16b, v1.16b\n" "and v6.16b, v22.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v20.4s, v20.4s, v4.4s\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v21.4s, v21.4s, v5.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v6.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "and v4.16b, v24.16b, v0.16b\n" "and v5.16b, v25.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v23.4s, v23.4s, v7.4s\n" - "and v6.16b, v26.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v4.4s\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v25.4s, v25.4s, v5.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "and v7.16b, v27.16b, v3.16b\n" "and v4.16b, v28.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v6.4s\n" - "and v5.16b, v29.16b, v1.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v27.4s, v27.4s, v7.4s\n" - "and v6.16b, v30.16b, v2.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v28.4s, v28.4s, v4.4s\n" + "and v5.16b, v29.16b, v1.16b\n" + "and v6.16b, v30.16b, v2.16b\n" "and v7.16b, v31.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v29.4s, v29.4s, v5.4s\n" "sqadd v30.4s, v30.4s, v6.4s\n" @@ -3275,8 +3275,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "srshl v14.4s, v14.4s, v2.4s\n" - "srshl v15.4s, v15.4s, v3.4s\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -3295,80 +3293,82 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax v11.4s, v11.4s, v5.4s\n" "smax v12.4s, v12.4s, v5.4s\n" "smax v13.4s, v13.4s, v5.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" "add v14.4s, v14.4s, v4.4s\n" "add v15.4s, v15.4s, v4.4s\n" - "srshl v16.4s, v16.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" "smin v14.4s, v14.4s, v6.4s\n" "smin v15.4s, v15.4s, v6.4s\n" - "srshl v17.4s, v17.4s, v1.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" "smax v14.4s, v14.4s, v5.4s\n" "smax v15.4s, v15.4s, v5.4s\n" - "add v16.4s, v16.4s, v4.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" "add v17.4s, 
v17.4s, v4.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" "add v18.4s, v18.4s, v4.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" "add v19.4s, v19.4s, v4.4s\n" - "srshl v20.4s, v20.4s, v0.4s\n" "smin v18.4s, v18.4s, v6.4s\n" + "add v20.4s, v20.4s, v4.4s\n" "smin v19.4s, v19.4s, v6.4s\n" - "srshl v21.4s, v21.4s, v1.4s\n" "smax v18.4s, v18.4s, v5.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" "smax v19.4s, v19.4s, v5.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v1.4s\n" "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "srshl v26.4s, v26.4s, v2.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "add v25.4s, v25.4s, v4.4s\n" "add v26.4s, v26.4s, v4.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "srshl v28.4s, v28.4s, v0.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" "smin v26.4s, v26.4s, v6.4s\n" "smin v27.4s, v27.4s, v6.4s\n" - "srshl v29.4s, v29.4s, v1.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" "smax v26.4s, v26.4s, v5.4s\n" "smax v27.4s, v27.4s, v5.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "srshl v29.4s, v29.4s, v1.4s\n" + "srshl v30.4s, v30.4s, v2.4s\n" + "srshl v31.4s, v31.4s, v3.4s\n" "add v28.4s, v28.4s, v4.4s\n" "add v29.4s, v29.4s, v4.4s\n" - "srshl v30.4s, v30.4s, v2.4s\n" + "add v30.4s, v30.4s, v4.4s\n" "smin v28.4s, v28.4s, v6.4s\n" "smin v29.4s, v29.4s, v6.4s\n" - "srshl v31.4s, v31.4s, v3.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" "smax v28.4s, v28.4s, v5.4s\n" "smax v29.4s, v29.4s, v5.4s\n" - "add v30.4s, v30.4s, v4.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" "add v31.4s, v31.4s, v4.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" "uzp1 v9.8h, v10.8h, v11.8h\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" "uzp1 v12.8h, v12.8h, v13.8h\n" "uzp1 v13.8h, v14.8h, v15.8h\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp new file mode 100644 index 0000000000..7eacdceae7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8qs_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8qs_mmla_6x16
+{
+public:
+    typedef int8_t lhs_operand_type;
+    typedef int8_t rhs_operand_type;
+    typedef int8_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 8;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return false;
+    }
+
+    StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        if (std::is_same<T, int8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 50.42 };
+                case CPUModel::A510:
+                    return { 28.71 };
+                case CPUModel::V1:
+                    return { 77.72 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_s8qs_mmla_6x16;
+    cls_a64_hybrid_s8qs_mmla_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
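The Requantize32 argument declared in ARGLIST drives the output stage that follows the int32 accumulation in the kernel below: each accumulator is scaled by a fixed-point multiplier (sqrdmulh), rounding-shifted right (srshl by a negative shift amount, with the sshr/sqadd sign correction guarded by flags bit 5), then offset and clamped. A scalar sketch of that per-value path, assuming per-layer parameters and a non-negative right shift, and ignoring the saturation and tie-breaking corner cases the tuned asm handles exactly:

    // Illustrative only: requantize one int32 accumulator to int8.
    #include <algorithm>
    #include <cstdint>

    static int8_t requantize(int32_t acc, int32_t mul, int right_shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: doubling multiply, keep the rounded high 32 bits.
        int64_t prod = (int64_t)acc * (int64_t)mul * 2;
        int32_t v = (int32_t)((prod + (1LL << 31)) >> 32);
        // srshl with a negative shift: rounding arithmetic shift right.
        if (right_shift > 0) {
            v = (v + (1 << (right_shift - 1))) >> right_shift;
        }
        // Add the output offset, then clamp to the quantized range.
        return (int8_t)std::min(std::max(v + c_offset, minval), maxval);
    }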
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..8924492e41
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -0,0 +1,3640 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_mmla_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+    struct KernelArgs {
+        const int32_t *multiplier_ptr = {};
+        const int32_t *shift_ptr = {};
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->per_channel_requant) {
+        flags |= 0x10;
+        ka.multiplier_ptr=qp->per_channel_muls + col_base;
+        ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+    }
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 146f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 117f\n"
+      "beq 88f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 59f\n"
+      "beq 30f\n"
+      "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "mov x11, %x[col_bias]\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x9, %x[output_ptr]\n"
+      "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "2:" // Height 1: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "3:" // Height 1: setup done
+      "mov x27, #0x0\n"
+      "4:" // Height 1: String loop
+      "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" + "blt 9f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 8f\n" + "7:" // Height 1: Multiply loop: Main loop head + "movi v2.16b, #0x0\n" + "ldr q7, [x28, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 7b\n" + "8:" // Height 1: Multiply loop: Single iteration only + "movi v2.16b, #0x0\n" + "ldr q7, [x28, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + 
".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "9:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 16f\n" + "cmp x26, #0x8\n" + "blt 11f\n" + "10:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + "bge 10b\n" + "cbz x26, 16f\n" + "11:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #2, 13f\n" + "ldr s1, [x25], #0x4\n" + "tbz x26, #1, 12f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "tbz x26, #0, 15f\n" + "ld1 { v1.b }[6], [x25]\n" + "b 15f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 15f\n" + "ld1 { v1.b }[4], [x25]\n" + "b 15f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 14f\n" + "ldr h1, [x25], #0x2\n" + "tbz x26, #0, 15f\n" + "ld1 { v1.b }[2], [x25]\n" + "b 15f\n" + "14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "15:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "16:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 4b\n" + "uzp1 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "ldr q2, [x11, #0x20]\n" + "mov v15.16b, v8.16b\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 17f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 
18f\n" + "17:" // Height 1: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "18:" // Height 1: parameters loaded + "sqrdmulh v15.4s, v15.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "tbz %x[flags], #5, 19f\n" + "and v4.16b, v15.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "19:" // Height 1: no shift correction + "srshl v15.4s, v15.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "uzp1 v15.8h, v15.8h, v9.8h\n" + "smax v11.4s, v11.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v15.16b, v15.16b, v9.16b\n" + "bge 28f\n" + "tbz x10, #3, 23f\n" + "str d15, [x9], #0x8\n" + "tbz x10, #2, 21f\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "tbz x10, #1, 20f\n" + "st1 { v15.h }[6], [x9], #0x2\n" + "tbz x10, #0, 27f\n" + "st1 { v15.b }[14], [x9]\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 27f\n" + "st1 { v15.b }[12], [x9]\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 22f\n" + "st1 { v15.h }[4], [x9], #0x2\n" + "tbz x10, #0, 27f\n" + "st1 { v15.b }[10], [x9]\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 27f\n" + "st1 { v15.b }[8], [x9]\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 25f\n" + "str s15, [x9], #0x4\n" + "tbz x10, #1, 24f\n" + "st1 { v15.h }[2], [x9], #0x2\n" + "tbz x10, #0, 27f\n" + "st1 { v15.b }[6], [x9]\n" + "b 27f\n" + "24:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 27f\n" + "st1 { v15.b }[4], [x9]\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 26f\n" + "str h15, [x9], #0x2\n" + "tbz x10, #0, 27f\n" + "st1 { v15.b }[2], [x9]\n" + "b 27f\n" + "26:" // Height 1: Partial direct writeback: partial_1_0 + "str b15, [x9, #0x0]\n" + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback + "str q15, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "29:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 176f\n" + "30:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "31:" // Height 2: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "32:" // Height 2: setup done + "mov x27, #0x0\n" + "33:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 34f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 35f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 35f\n" + "34:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "35:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 38f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 37f\n" + "36:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x24, x24, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 36b\n" + "37:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x25, x25, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "add x24, x24, #0x10\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, 
v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "38:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 45f\n" + "cmp x26, #0x8\n" + "blt 40f\n" + "39:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + "bge 39b\n" + "cbz x26, 45f\n" + "40:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #2, 42f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "tbz x26, #1, 41f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "tbz x26, #0, 44f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "b 44f\n" + "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 44f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "b 44f\n" + "42:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 43f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x26, #0, 44f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 44f\n" + "43:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "44:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "45:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, 
#0x1\n" + "cmp x27, x19\n" + "bne 33b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x23, x9, x19\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "ldr q2, [x11, #0x20]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "mov v15.16b, v7.16b\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 46f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 47f\n" + "46:" // Height 2: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "47:" // Height 2: parameters loaded + "sqrdmulh v15.4s, v15.4s, v4.4s\n" + "sqrdmulh v12.4s, v12.4s, v5.4s\n" + "sqrdmulh v13.4s, v13.4s, v6.4s\n" + "sqrdmulh v14.4s, v14.4s, v7.4s\n" + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "tbz %x[flags], #5, 48f\n" + "and v4.16b, v15.16b, v0.16b\n" + "and v5.16b, v12.16b, v1.16b\n" + "and v6.16b, v13.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "sqadd v12.4s, v12.4s, v5.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "and v7.16b, v14.16b, v3.16b\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v7.4s\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "48:" // Height 2: no shift correction + "srshl v15.4s, v15.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v12.4s, v12.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v13.4s, v13.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v14.4s, v14.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + 
"smin v9.4s, v9.4s, v6.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "uzp1 v15.8h, v15.8h, v12.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "uzp1 v15.16b, v15.16b, v12.16b\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 57f\n" + "tbz x10, #3, 52f\n" + "str d15, [x9], #0x8\n" + "str d8, [x23], #0x8\n" + "tbz x10, #2, 50f\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v8.s }[2], [x23], #0x4\n" + "tbz x10, #1, 49f\n" + "st1 { v15.h }[6], [x9], #0x2\n" + "st1 { v8.h }[6], [x23], #0x2\n" + "tbz x10, #0, 56f\n" + "st1 { v15.b }[14], [x9]\n" + "st1 { v8.b }[14], [x23]\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 56f\n" + "st1 { v15.b }[12], [x9]\n" + "st1 { v8.b }[12], [x23]\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 51f\n" + "st1 { v15.h }[4], [x9], #0x2\n" + "st1 { v8.h }[4], [x23], #0x2\n" + "tbz x10, #0, 56f\n" + "st1 { v15.b }[10], [x9]\n" + "st1 { v8.b }[10], [x23]\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 56f\n" + "st1 { v15.b }[8], [x9]\n" + "st1 { v8.b }[8], [x23]\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 54f\n" + "str s15, [x9], #0x4\n" + "str s8, [x23], #0x4\n" + "tbz x10, #1, 53f\n" + "st1 { v15.h }[2], [x9], #0x2\n" + "st1 { v8.h }[2], [x23], #0x2\n" + "tbz x10, #0, 56f\n" + "st1 { v15.b }[6], [x9]\n" + "st1 { v8.b }[6], [x23]\n" + "b 56f\n" + "53:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 56f\n" + "st1 { v15.b }[4], [x9]\n" + "st1 { v8.b }[4], [x23]\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 55f\n" + "str h15, [x9], #0x2\n" + "str h8, [x23], #0x2\n" + "tbz x10, #0, 56f\n" + "st1 { v15.b }[2], [x9]\n" + "st1 { v8.b }[2], [x23]\n" + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 + "str b15, [x9, #0x0]\n" + "str b8, [x23, #0x0]\n" + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback + "str q15, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "str q8, [x23, #0x0]\n" + "58:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 31b\n" + "b 176f\n" + "59:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "60:" // Height 3: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "61:" // Height 3: setup done + "mov x27, #0x0\n" + "62:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 63f\n" + 
"ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 64f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 64f\n" + "63:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "64:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 67f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 66f\n" + "65:" // Height 3: Multiply loop: Main loop head + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "bge 65b\n" + "66:" // Height 3: Multiply loop: Single iteration only + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, 
v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "67:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 74f\n" + "cmp x26, #0x8\n" + "blt 69f\n" + "68:" // Height 3: Multiply loop: Odd block loop + "movi v4.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x0]\n" + "ldr q7, [x28, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 
0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 68b\n" + "cbz x26, 74f\n" + "69:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #2, 71f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "tbz x26, #1, 70f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "tbz x26, #0, 73f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "b 73f\n" + "70:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 73f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "b 73f\n" + "71:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 72f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "tbz x26, #0, 73f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 73f\n" + "72:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "73:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v4.16b, #0x0\n" + "ldr q7, [x28, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "74:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 62b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x23, x9, x19\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "ldr q2, [x11, #0x20]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 
v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v7.16b\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 75f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 76f\n" + "75:" // Height 3: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "76:" // Height 3: parameters loaded + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v12.4s, v12.4s, v5.4s\n" + "sqrdmulh v13.4s, v13.4s, v6.4s\n" + "sqrdmulh v14.4s, v14.4s, v7.4s\n" + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 77f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v12.16b, v1.16b\n" + "and v6.16b, v13.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v12.4s, v12.4s, v5.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "and v7.16b, v14.16b, v3.16b\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v7.4s\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "77:" // Height 3: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v12.4s, v12.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v13.4s, v13.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v14.4s, v14.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, 
v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "uzp1 v23.8h, v23.8h, v12.8h\n" + "add v18.4s, v18.4s, v4.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 86f\n" + "tbz x10, #3, 81f\n" + "str d23, [x9], #0x8\n" + "str d8, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #2, 79f\n" + "st1 { v23.s }[2], [x9], #0x4\n" + "st1 { v8.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "tbz x10, #1, 78f\n" + "st1 { v23.h }[6], [x9], #0x2\n" + "st1 { v8.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "tbz x10, #0, 85f\n" + "st1 { v23.b }[14], [x9]\n" + "st1 { v8.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 85f\n" + "st1 { v23.b }[12], [x9]\n" + "st1 { v8.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 80f\n" + "st1 { v23.h }[4], [x9], #0x2\n" + "st1 { v8.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "tbz x10, #0, 85f\n" + "st1 { v23.b }[10], [x9]\n" + "st1 { v8.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 85f\n" + "st1 { v23.b }[8], [x9]\n" + "st1 { v8.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 83f\n" + "str s23, [x9], #0x4\n" + "str s8, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "tbz x10, #1, 82f\n" + "st1 { v23.h }[2], [x9], #0x2\n" + "st1 { v8.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "tbz x10, #0, 85f\n" + "st1 { v23.b }[6], [x9]\n" + "st1 { v8.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 85f\n" + "st1 { v23.b }[4], [x9]\n" + "st1 { v8.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "b 85f\n" + "83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 84f\n" + "str h23, [x9], #0x2\n" + "str h8, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "tbz x10, #0, 85f\n" + "st1 { v23.b }[2], [x9]\n" + "st1 { v8.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 + "str b23, [x9, #0x0]\n" + "str b8, [x23, #0x0]\n" + "str b16, 
[x22, #0x0]\n" + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback + "str q23, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "str q8, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "87:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 60b\n" + "b 176f\n" + "88:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "89:" // Height 4: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "90:" // Height 4: setup done + "mov x27, #0x0\n" + "91:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 92f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 93f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 93f\n" + "92:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "93:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 96f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 95f\n" + "94:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "cmp x26, #0x20\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // 
smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "bge 94b\n" + "95:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla 
v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "96:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 103f\n" + "cmp x26, #0x8\n" + "blt 98f\n" + "97:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x0]\n" + "ldr q7, [x28, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 97b\n" + "cbz x26, 103f\n" + "98:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #2, 100f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x26, #1, 99f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "tbz x26, #0, 102f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "b 102f\n" + "99:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 102f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "b 102f\n" + "100:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 101f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x26, #0, 102f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 102f\n" + "101:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "102:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, 
[x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "103:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 91b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x23, x9, x19\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "ldr q2, [x11, #0x20]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v7.16b\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 104f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 105f\n" + "104:" // Height 4: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "105:" // Height 4: parameters loaded + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v12.4s, v12.4s, v5.4s\n" + "sqrdmulh v13.4s, v13.4s, v6.4s\n" + "sqrdmulh v14.4s, v14.4s, v7.4s\n" + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v15.4s, v15.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v5.4s\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sqrdmulh v22.4s, 
v22.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 106f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v12.16b, v1.16b\n" + "and v6.16b, v13.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v12.4s, v12.4s, v5.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "and v7.16b, v14.16b, v3.16b\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v7.4s\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v15.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "and v5.16b, v20.16b, v1.16b\n" + "and v6.16b, v21.16b, v2.16b\n" + "and v7.16b, v22.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "106:" // Height 4: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v12.4s, v12.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v13.4s, v13.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v14.4s, v14.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v1.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v2.4s\n" + "srshl v22.4s, v22.4s, v3.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "add v16.4s, v16.4s, 
v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "uzp1 v23.8h, v23.8h, v12.8h\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v15.8h, v15.8h, v20.8h\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 115f\n" + "tbz x10, #3, 110f\n" + "str d23, [x9], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "tbz x10, #2, 108f\n" + "st1 { v23.s }[2], [x9], #0x4\n" + "st1 { v8.s }[2], [x23], #0x4\n" + "st1 { v15.s }[2], [x22], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "tbz x10, #1, 107f\n" + "st1 { v23.h }[6], [x9], #0x2\n" + "st1 { v8.h }[6], [x23], #0x2\n" + "st1 { v15.h }[6], [x22], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "tbz x10, #0, 114f\n" + "st1 { v23.b }[14], [x9]\n" + "st1 { v8.b }[14], [x23]\n" + "st1 { v15.b }[14], [x22]\n" + "st1 { v16.b }[14], [x21]\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 114f\n" + "st1 { v23.b }[12], [x9]\n" + "st1 { v8.b }[12], [x23]\n" + "st1 { v15.b }[12], [x22]\n" + "st1 { v16.b }[12], [x21]\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 109f\n" + "st1 { v23.h }[4], [x9], #0x2\n" + "st1 { v8.h }[4], [x23], #0x2\n" + "st1 { v15.h }[4], [x22], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "tbz x10, #0, 114f\n" + "st1 { v23.b }[10], [x9]\n" + "st1 { v8.b }[10], [x23]\n" + "st1 { v15.b }[10], [x22]\n" + "st1 { v16.b }[10], [x21]\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 114f\n" + "st1 { v23.b }[8], [x9]\n" + "st1 { v8.b }[8], [x23]\n" + "st1 { v15.b }[8], [x22]\n" + "st1 { v16.b }[8], [x21]\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 112f\n" + "str s23, [x9], #0x4\n" + "str s8, [x23], #0x4\n" + "str s15, [x22], #0x4\n" + "str s16, [x21], #0x4\n" + "tbz x10, #1, 111f\n" + "st1 { v23.h }[2], [x9], #0x2\n" + "st1 { v8.h }[2], [x23], #0x2\n" + "st1 { v15.h }[2], [x22], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "tbz x10, #0, 114f\n" + "st1 { v23.b }[6], [x9]\n" + "st1 { v8.b }[6], [x23]\n" + "st1 { v15.b }[6], [x22]\n" + "st1 { v16.b }[6], [x21]\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 114f\n" + "st1 { v23.b }[4], [x9]\n" + "st1 { v8.b }[4], [x23]\n" + "st1 { v15.b }[4], [x22]\n" + "st1 { v16.b }[4], [x21]\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 113f\n" + "str h23, [x9], #0x2\n" + "str h8, [x23], #0x2\n" + "str h15, [x22], #0x2\n" + "str h16, [x21], #0x2\n" + "tbz x10, #0, 114f\n" + "st1 { v23.b }[2], [x9]\n" + "st1 { v8.b }[2], [x23]\n" + "st1 { v15.b }[2], [x22]\n" + "st1 { v16.b }[2], [x21]\n" + "b 114f\n" + "113:" // Height 4: Partial direct 
writeback: partial_1_0 + "str b23, [x9, #0x0]\n" + "str b8, [x23, #0x0]\n" + "str b15, [x22, #0x0]\n" + "str b16, [x21, #0x0]\n" + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback + "str q23, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "str q8, [x23, #0x0]\n" + "str q15, [x22, #0x0]\n" + "str q16, [x21, #0x0]\n" + "116:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 89b\n" + "b 176f\n" + "117:" // Height 5 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "118:" // Height 5: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "119:" // Height 5: setup done + "mov x27, #0x0\n" + "120:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 122f\n" + "121:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "122:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 125f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 124f\n" + "123:" // Height 5: Multiply loop: Main loop head + "movi v6.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x21, x21, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + 
".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "bge 123b\n" + "124:" // Height 5: Multiply loop: Single iteration only + "movi v6.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + "prfm pldl1keep, [x22, 
#0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "125:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 132f\n" + "cmp x26, #0x8\n" + "blt 127f\n" + "126:" // Height 5: Multiply loop: Odd block loop + "movi v7.4s, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr q6, [x28, #0x0]\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x28, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, 
v6.16b\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 126b\n" + "cbz x26, 132f\n" + "127:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #2, 129f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "tbz x26, #1, 128f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "tbz x26, #0, 131f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "b 131f\n" + "128:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 131f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "b 131f\n" + "129:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 130f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "tbz x26, #0, 131f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "b 131f\n" + "130:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "131:" // Height 5: Multiply loop: Ragged operand read: Done + "movi v6.4s, #0x0\n" + "ldr q7, [x28, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, 
v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "132:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 120b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x23, x9, x19\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "ldr q2, [x11, #0x20]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v7.16b\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 133f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 134f\n" + "133:" // Height 5: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + 
"mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "134:" // Height 5: parameters loaded + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v12.4s, v12.4s, v5.4s\n" + "sqrdmulh v13.4s, v13.4s, v6.4s\n" + "sqrdmulh v14.4s, v14.4s, v7.4s\n" + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v15.4s, v15.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v5.4s\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sqrdmulh v22.4s, v22.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "tbz %x[flags], #5, 135f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v12.16b, v1.16b\n" + "and v6.16b, v13.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v12.4s, v12.4s, v5.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "and v7.16b, v14.16b, v3.16b\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v7.4s\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v15.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "and v5.16b, v20.16b, v1.16b\n" + "and v6.16b, v21.16b, v2.16b\n" + "and v7.16b, v22.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "and v4.16b, v24.16b, v0.16b\n" + "and v5.16b, v25.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "135:" // Height 5: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v12.4s, v12.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v13.4s, v13.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v14.4s, v14.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v12.4s, v12.4s, 
v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v1.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v2.4s\n" + "srshl v22.4s, v22.4s, v3.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v31.8h, v31.8h, v12.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v15.8h, v15.8h, v20.8h\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 144f\n" + "tbz x10, #3, 139f\n" + "str d31, [x9], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #2, 137f\n" + "st1 { v31.s }[2], [x9], #0x4\n" + "st1 { v8.s }[2], [x23], #0x4\n" + "st1 { v15.s }[2], [x22], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x10, #1, 136f\n" + "st1 { v31.h }[6], [x9], #0x2\n" + "st1 { v8.h }[6], [x23], #0x2\n" + "st1 { v15.h }[6], [x22], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x10, #0, 143f\n" + "st1 { v31.b }[14], [x9]\n" + "st1 { v8.b }[14], [x23]\n" + "st1 { v15.b }[14], [x22]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 143f\n" + "136:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 143f\n" + "st1 { 
v31.b }[12], [x9]\n" + "st1 { v8.b }[12], [x23]\n" + "st1 { v15.b }[12], [x22]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 143f\n" + "137:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 138f\n" + "st1 { v31.h }[4], [x9], #0x2\n" + "st1 { v8.h }[4], [x23], #0x2\n" + "st1 { v15.h }[4], [x22], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x10, #0, 143f\n" + "st1 { v31.b }[10], [x9]\n" + "st1 { v8.b }[10], [x23]\n" + "st1 { v15.b }[10], [x22]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 143f\n" + "138:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 143f\n" + "st1 { v31.b }[8], [x9]\n" + "st1 { v8.b }[8], [x23]\n" + "st1 { v15.b }[8], [x22]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 143f\n" + "139:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 141f\n" + "str s31, [x9], #0x4\n" + "str s8, [x23], #0x4\n" + "str s15, [x22], #0x4\n" + "str s16, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x10, #1, 140f\n" + "st1 { v31.h }[2], [x9], #0x2\n" + "st1 { v8.h }[2], [x23], #0x2\n" + "st1 { v15.h }[2], [x22], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x10, #0, 143f\n" + "st1 { v31.b }[6], [x9]\n" + "st1 { v8.b }[6], [x23]\n" + "st1 { v15.b }[6], [x22]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 143f\n" + "140:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 143f\n" + "st1 { v31.b }[4], [x9]\n" + "st1 { v8.b }[4], [x23]\n" + "st1 { v15.b }[4], [x22]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 143f\n" + "141:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 142f\n" + "str h31, [x9], #0x2\n" + "str h8, [x23], #0x2\n" + "str h15, [x22], #0x2\n" + "str h16, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x10, #0, 143f\n" + "st1 { v31.b }[2], [x9]\n" + "st1 { v8.b }[2], [x23]\n" + "st1 { v15.b }[2], [x22]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 143f\n" + "142:" // Height 5: Partial direct writeback: partial_1_0 + "str b31, [x9, #0x0]\n" + "str b8, [x23, #0x0]\n" + "str b15, [x22, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "143:" // Height 5: Partial direct writeback: Done + "b 145f\n" + "144:" // Height 5: Full writeback + "str q31, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "str q8, [x23, #0x0]\n" + "str q15, [x22, #0x0]\n" + "str q16, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "145:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 118b\n" + "b 176f\n" + "146:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x20, #0x6\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "147:" // Height 6: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + 
"movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "148:" // Height 6: setup done + "mov x27, #0x0\n" + "149:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 150f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 151f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 151f\n" + "150:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "151:" // Height 6: input setup done + "cmp x26, #0x10\n" + "blt 154f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 153f\n" + "152:" // Height 6: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, 
v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "bge 152b\n" + "153:" // Height 6: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x28, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + ".inst 0x4e87a40b // 
smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x28, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "154:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 161f\n" + "cmp x26, #0x8\n" + "blt 156f\n" + "155:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q6, [x28, #0x0]\n" + "ldr q7, [x28, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + 
".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 155b\n" + "cbz x26, 161f\n" + "156:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #2, 158f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "ldr s6, [x20], #0x4\n" + "tbz x26, #1, 157f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "ld1 { v6.h }[2], [x20], #0x2\n" + "tbz x26, #0, 160f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "ld1 { v6.b }[6], [x20]\n" + "b 160f\n" + "157:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 160f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "ld1 { v6.b }[4], [x20]\n" + "b 160f\n" + "158:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 159f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "ldr h6, [x20], #0x2\n" + "tbz x26, #0, 160f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "ld1 { v6.b }[2], [x20]\n" + "b 160f\n" + "159:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "ldr b6, [x20, #0x0]\n" + "160:" // Height 6: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x28, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x28, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, 
v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "161:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 149b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x23, x9, x19\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "ldr q0, [x11, #0x0]\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x19, #0x0]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "ldr q1, [x11, #0x10]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "ldr q2, [x11, #0x20]\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v7.16b\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 162f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 163f\n" + "162:" // Height 6: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "163:" // Height 6: parameters loaded + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v12.4s, v12.4s, v5.4s\n" + "sqrdmulh v13.4s, v13.4s, v6.4s\n" + "sqrdmulh v14.4s, v14.4s, v7.4s\n" + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v15.4s, v15.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v5.4s\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + 
"sqrdmulh v22.4s, v22.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v5.4s\n" + "sqrdmulh v29.4s, v29.4s, v6.4s\n" + "sqrdmulh v30.4s, v30.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "tbz %x[flags], #5, 164f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v12.16b, v1.16b\n" + "and v6.16b, v13.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v12.4s, v12.4s, v5.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "and v7.16b, v14.16b, v3.16b\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v7.4s\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "and v4.16b, v15.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "and v5.16b, v20.16b, v1.16b\n" + "and v6.16b, v21.16b, v2.16b\n" + "and v7.16b, v22.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v1.16b\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v28.16b, v1.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v28.4s, v28.4s, v5.4s\n" + "and v6.16b, v29.16b, v2.16b\n" + "and v7.16b, v30.16b, v3.16b\n" + "and v4.16b, v24.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v29.4s, v29.4s, v6.4s\n" + "sqadd v30.4s, v30.4s, v7.4s\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "and v6.16b, v26.16b, v2.16b\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "164:" // Height 6: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" + "srshl v12.4s, v12.4s, v1.4s\n" + "add x24, %x[qp], %[minval]\n" + "srshl v13.4s, v13.4s, v2.4s\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "srshl v14.4s, v14.4s, v3.4s\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" 
+ "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v1.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v2.4s\n" + "srshl v22.4s, v22.4s, v3.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "srshl v28.4s, v28.4s, v1.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v2.4s\n" + "srshl v30.4s, v30.4s, v3.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v31.8h, v31.8h, v12.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v15.8h, v15.8h, v20.8h\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.8h, v23.8h, v28.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 173f\n" + "tbz x10, #3, 168f\n" + "str d31, [x9], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "str d24, [x19], #0x8\n" + "tbz 
x10, #2, 166f\n" + "st1 { v31.s }[2], [x9], #0x4\n" + "st1 { v8.s }[2], [x23], #0x4\n" + "st1 { v15.s }[2], [x22], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v23.s }[2], [x20], #0x4\n" + "st1 { v24.s }[2], [x19], #0x4\n" + "tbz x10, #1, 165f\n" + "st1 { v31.h }[6], [x9], #0x2\n" + "st1 { v8.h }[6], [x23], #0x2\n" + "st1 { v15.h }[6], [x22], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v23.h }[6], [x20], #0x2\n" + "st1 { v24.h }[6], [x19], #0x2\n" + "tbz x10, #0, 172f\n" + "st1 { v31.b }[14], [x9]\n" + "st1 { v8.b }[14], [x23]\n" + "st1 { v15.b }[14], [x22]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v23.b }[14], [x20]\n" + "st1 { v24.b }[14], [x19]\n" + "b 172f\n" + "165:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 172f\n" + "st1 { v31.b }[12], [x9]\n" + "st1 { v8.b }[12], [x23]\n" + "st1 { v15.b }[12], [x22]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v23.b }[12], [x20]\n" + "st1 { v24.b }[12], [x19]\n" + "b 172f\n" + "166:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 167f\n" + "st1 { v31.h }[4], [x9], #0x2\n" + "st1 { v8.h }[4], [x23], #0x2\n" + "st1 { v15.h }[4], [x22], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v23.h }[4], [x20], #0x2\n" + "st1 { v24.h }[4], [x19], #0x2\n" + "tbz x10, #0, 172f\n" + "st1 { v31.b }[10], [x9]\n" + "st1 { v8.b }[10], [x23]\n" + "st1 { v15.b }[10], [x22]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v23.b }[10], [x20]\n" + "st1 { v24.b }[10], [x19]\n" + "b 172f\n" + "167:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 172f\n" + "st1 { v31.b }[8], [x9]\n" + "st1 { v8.b }[8], [x23]\n" + "st1 { v15.b }[8], [x22]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v23.b }[8], [x20]\n" + "st1 { v24.b }[8], [x19]\n" + "b 172f\n" + "168:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 170f\n" + "str s31, [x9], #0x4\n" + "str s8, [x23], #0x4\n" + "str s15, [x22], #0x4\n" + "str s16, [x21], #0x4\n" + "str s23, [x20], #0x4\n" + "str s24, [x19], #0x4\n" + "tbz x10, #1, 169f\n" + "st1 { v31.h }[2], [x9], #0x2\n" + "st1 { v8.h }[2], [x23], #0x2\n" + "st1 { v15.h }[2], [x22], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v23.h }[2], [x20], #0x2\n" + "st1 { v24.h }[2], [x19], #0x2\n" + "tbz x10, #0, 172f\n" + "st1 { v31.b }[6], [x9]\n" + "st1 { v8.b }[6], [x23]\n" + "st1 { v15.b }[6], [x22]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v23.b }[6], [x20]\n" + "st1 { v24.b }[6], [x19]\n" + "b 172f\n" + "169:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 172f\n" + "st1 { v31.b }[4], [x9]\n" + "st1 { v8.b }[4], [x23]\n" + "st1 { v15.b }[4], [x22]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v23.b }[4], [x20]\n" + "st1 { v24.b }[4], [x19]\n" + "b 172f\n" + "170:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 171f\n" + "str h31, [x9], #0x2\n" + "str h8, [x23], #0x2\n" + "str h15, [x22], #0x2\n" + "str h16, [x21], #0x2\n" + "str h23, [x20], #0x2\n" + "str h24, [x19], #0x2\n" + "tbz x10, #0, 172f\n" + "st1 { v31.b }[2], [x9]\n" + "st1 { v8.b }[2], [x23]\n" + "st1 { v15.b }[2], [x22]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v23.b }[2], [x20]\n" + "st1 { v24.b }[2], [x19]\n" + "b 172f\n" + "171:" // Height 6: Partial direct writeback: partial_1_0 + "str b31, [x9, #0x0]\n" + "str b8, [x23, #0x0]\n" + "str b15, [x22, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b23, [x20, #0x0]\n" + "str b24, [x19, #0x0]\n" + "172:" // Height 6: Partial direct writeback: Done + "b 174f\n" + "173:" // Height 6: Full writeback + "str q31, [x9, #0x0]\n" + "add 
x9, x9, #0x10\n"
+      "str q8, [x23, #0x0]\n"
+      "str q15, [x22, #0x0]\n"
+      "str q16, [x21, #0x0]\n"
+      "str q23, [x20, #0x0]\n"
+      "str q24, [x19, #0x0]\n"
+      "174:"  // Height 6: Writeback done
+      "subs x10, x10, #0x10\n"
+      "bgt 147b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 176f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 175f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "175:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "176:"  // Exit
+
+    : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+    : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
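[Editor's note] The trn1/trn2 pairing of input rows on entry to each multiply loop above, and the uzp1/uzp2 split of the accumulators before requantization, follow from the semantics of the .inst-encoded smmla instructions: each smmla accumulates a 2x2 tile of int32 results from two 2x8 panels of int8 operands, which is also why these kernels use a k_unroll of 8. A standalone scalar model of one such instruction, for reference only (this sketch is not part of the patch):

// Scalar reference for one SMMLA step, following the architectural definition
// (FEAT_I8MM): the destination vector holds a row-major 2x2 int32 tile, and
// each source vector holds a row-major 2x8 int8 matrix; the result is
// dst += a * b^T, accumulated over the 8-deep inner dimension.
#include <cstdint>

void smmla_ref(int32_t dst[2][2], const int8_t a[2][8], const int8_t b[2][8])
{
    for (int i = 0; i < 2; i++) {         // row of the LHS pair (trn1/trn2 output)
        for (int j = 0; j < 2; j++) {     // column pair from the interleaved B panel
            int32_t acc = dst[i][j];
            for (int k = 0; k < 8; k++) { // k_unroll of 8 int8 values per step
                acc += static_cast<int32_t>(a[i][k]) * static_cast<int32_t>(b[j][k]);
            }
            dst[i][j] = acc;
        }
    }
}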
*/ #pragma once -#ifdef __aarch64__ +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -44,7 +44,8 @@ void a64_hybrid_s8s32_dot_6x16_a55( ARGLIST ); class cls_a64_hybrid_s8s32_dot_6x16 { public: - typedef int8_t operand_type; + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; typedef int32_t result_type; typedef void (*kern_type)( ARGLIST ); @@ -70,16 +71,35 @@ public: return true; } - StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {}; - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 12.667, 2.0799, 0.2279 }; - default: - return { 29.6736, 11.4025, 0.5591 }; + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.65 }; + case CPUModel::A510: + return { 15.87 }; + case CPUModel::V1: + return { 54.50 }; + } } + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 9.5238, 2.0799, 0.2279 }; + default: + return { 29.6736, 11.4025, 0.5591 }; + case CPUModel::A510: + return { 16.66, 3.92, 0.48 }; + case CPUModel::V1: + return { 55.40, 19.21, 0.93 }; + } + } + + return { 1.0 }; } // Default to the generic kernel @@ -99,4 +119,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp index 3566027a50..e47295a766 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp @@ -1819,8 +1819,8 @@ void a64_hybrid_s8s32_dot_6x16 ( "ld1 { v22.4s }, [x21], #0x10\n" "ld1 { v26.4s }, [x20], #0x10\n" "tbz x10, #1, 139f\n" - "mov x24, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x24, #0x38\n" "ldr d15, [x23], #0x8\n" "ldr d19, [x22], #0x8\n" "ldr d23, [x21], #0x8\n" @@ -1873,8 +1873,8 @@ void a64_hybrid_s8s32_dot_6x16 ( "ld1 { v20.4s }, [x21], #0x10\n" "ld1 { v24.4s }, [x20], #0x10\n" "tbz x10, #1, 143f\n" - "mov x24, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x24, #0x18\n" "ldr d13, [x23], #0x8\n" "ldr d17, [x22], #0x8\n" "ldr d21, [x21], #0x8\n" @@ -2487,12 +2487,12 @@ void a64_hybrid_s8s32_dot_6x16 ( "ld1 { v16.4s }, [x22], #0x10\n" "ld1 { v20.4s }, [x21], #0x10\n" "ld1 { v24.4s }, [x20], #0x10\n" - "ld1 { v28.4s }, [x19], #0x10\n" "ld1 { v9.4s }, [x28], #0x10\n" "ld1 { v13.4s }, [x23], #0x10\n" "ld1 { v17.4s }, [x22], #0x10\n" "ld1 { v21.4s }, [x21], #0x10\n" "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" "ld1 { v29.4s }, [x19], #0x10\n" "tbz x10, #2, 174f\n" "ld1 { v10.4s }, [x28], #0x10\n" @@ -2502,8 +2502,8 @@ void a64_hybrid_s8s32_dot_6x16 ( "ld1 { v14.4s }, [x23], #0x10\n" "ld1 { v18.4s }, [x22], #0x10\n" "ld1 { v22.4s }, [x21], #0x10\n" "ld1 { v26.4s }, [x20], #0x10\n" "ld1 { v30.4s }, [x19], #0x10\n" "tbz x10, #1, 173f\n" - "mov x24, #0x38\n" "ldr d11, [x28], #0x8\n" + "mov x24, #0x38\n" "ldr d15, [x23], #0x8\n" "ldr d19, [x22], #0x8\n" "ldr d23, [x21], #0x8\n" @@ -2563,8 +2563,8 @@ void a64_hybrid_s8s32_dot_6x16 ( "ld1 { v24.4s }, [x20], #0x10\n" "ld1 { v28.4s }, [x19], #0x10\n" "tbz x10, #1, 177f\n" - "mov x24, #0x18\n" "ldr d9, [x28], #0x8\n" + "mov x24, #0x18\n" "ldr d13, [x23], #0x8\n" "ldr d17, [x22], #0x8\n" "ldr d21, [x21], #0x8\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp new file mode 100644 index 0000000000..50ccb6fa3d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int32_t>, \ + const int32_t *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hybrid_s8s32_mmla_6x16( ARGLIST ); + +class cls_a64_hybrid_s8s32_mmla_6x16 +{ +public: + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 54.98 }; + case CPUModel::A510: + return { 30.30 }; + case CPUModel::V1: + return { 83.71 }; + } + } + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 55.27, 15.25, 0.62 }; + case CPUModel::A510: + return { 33.62, 3.92, 0.48 }; + case CPUModel::V1: + return { 86.36, 19.25, 0.92 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8s32_mmla_6x16; + cls_a64_hybrid_s8s32_mmla_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp new file mode 100644 index 0000000000..a9f6b06ae1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp @@ -0,0 +1,3463 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void a64_hybrid_s8s32_mmla_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg, + const int32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 186f\n" + "cmp %x[M], #0x4\n" + "bgt 149f\n" + "beq 112f\n" + "cmp %x[M], #0x2\n" + "bgt 75f\n" + "beq 38f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "tbz %x[flags], #0, 13f\n" + "cmp x10, #0x10\n" + "bge 11f\n" + "tbz x10, #3, 6f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #2, 4f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "tbz x10, #1, 3f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v16.s }[2], [x28]\n" + "b 10f\n" + "3:" // Height 1: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 10f\n" + "ldr s16, [x28, #0x0]\n" + "b 10f\n" + "4:" // Height 1: Partial accumulate: partial_2_8 + "tbz x10, #1, 5f\n" + "ldr d11, [x28], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 10f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 10f\n" + "5:" // Height 1: Partial accumulate:
partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 10f\n" + "ldr s11, [x28, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x10, #2, 8f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #1, 7f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "tbz x10, #0, 10f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 10f\n" + "ldr s10, [x28, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x10, #1, 9f\n" + "ldr d9, [x28], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 12f\n" + "11:" // Height 1: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "12:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x27, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x26, #0x10\n" + "blt 20f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" 
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 27f\n" + "cmp x26, #0x8\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + "bge 21b\n" + "cbz x26, 27f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #2, 24f\n" + "ldr s1, [x25], #0x4\n" + "tbz x26, #1, 23f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[6], [x25]\n" + "b 26f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[4], [x25]\n" + "b 26f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 25f\n" + "ldr h1, [x25], #0x2\n" + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[2], [x25]\n" + "b 26f\n" + "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "26:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "ldr q6, [x9, 
#0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "27:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 15b\n" + "uzp1 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "cmp x10, #0x10\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "bge 36f\n" + "tbz x10, #3, 31f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 29f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 28f\n" + "str d11, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v11.s }[2], [x28]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 35f\n" + "str s11, [x28, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 30f\n" + "str d10, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v10.s }[2], [x28]\n" + "b 35f\n" + "30:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 35f\n" + "str s10, [x28, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 33f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 32f\n" + "str d9, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v9.s }[2], [x28]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 35f\n" + "str s9, [x28, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 34f\n" + "str d8, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v8.s }[2], [x28]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "37:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "39:" // Height 2: Column loop + "tbz %x[flags], #0, 50f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "bge 48f\n" + "tbz x10, #3, 43f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 41f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 40f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "b 47f\n" + "40:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 47f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, 
[x23, #0x0]\n" + "b 47f\n" + "41:" // Height 2: Partial accumulate: partial_2_8 + "tbz x10, #1, 42f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 47f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 47f\n" + "42:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 47f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 47f\n" + "43:" // Height 2: Partial accumulate: partial_4_0 + "tbz x10, #2, 45f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 44f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 47f\n" + "44:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 47f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 47f\n" + "45:" // Height 2: Partial accumulate: partial_2_0 + "tbz x10, #1, 46f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 47f\n" + "46:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "47:" // Height 2: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 49f\n" + "48:" // Height 2: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "49:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 51f\n" + "50:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "51:" // Height 2: setup done + "mov x27, #0x0\n" + "52:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 53f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 54f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 54f\n" + "53:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "54:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 57f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 56f\n" + "55:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x24, x24, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e86a40d // smmla v13.4s, 
v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 55b\n" + "56:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x25, x25, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "add x24, x24, #0x10\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "57:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 64f\n" + "cmp x26, #0x8\n" + "blt 59f\n" + "58:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, 
v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + "bge 58b\n" + "cbz x26, 64f\n" + "59:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #2, 61f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "tbz x26, #1, 60f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "b 63f\n" + "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "b 63f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 62f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 63f\n" + "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "63:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "64:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 52b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "bge 73f\n" + "tbz x10, #3, 68f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "tbz x10, #2, 66f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "tbz x10, #1, 65f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 72f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 67f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 72f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 70f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "tbz x10, #1, 69f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v12.s }[2], 
[x28]\n" + "st1 { v9.s }[2], [x23]\n" + "b 72f\n" + "69:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 72f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 71f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "74:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 39b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "76:" // Height 3: Column loop + "tbz %x[flags], #0, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 85f\n" + "tbz x10, #3, 80f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #2, 78f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "tbz x10, #1, 77f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "b 84f\n" + "77:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 84f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "b 84f\n" + "78:" // Height 3: Partial accumulate: partial_2_8 + "tbz x10, #1, 79f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 84f\n" + "79:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 84f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 84f\n" + "80:" // Height 3: Partial accumulate: partial_4_0 + "tbz x10, #2, 82f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #1, 81f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 84f\n" + "81:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 84f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 84f\n" + "82:" // Height 3: Partial accumulate: partial_2_0 + "tbz x10, #1, 83f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 84f\n" + "83:" 
// Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "84:" // Height 3: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 86f\n" + "85:" // Height 3: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "86:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 88f\n" + "87:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "88:" // Height 3: setup done + "mov x27, #0x0\n" + "89:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 90f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 91f\n" + "90:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "91:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 94f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 93f\n" + "92:" // Height 3: Multiply loop: Main loop head + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, 
v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "bge 92b\n" + "93:" // Height 3: Multiply loop: Single iteration only + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 
// smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "94:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 101f\n" + "cmp x26, #0x8\n" + "blt 96f\n" + "95:" // Height 3: Multiply loop: Odd block loop + "movi v4.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 95b\n" + "cbz x26, 101f\n" + "96:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #2, 98f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "tbz x26, #1, 97f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "b 100f\n" + "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "b 100f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 99f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 100f\n" + "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "100:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v4.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, 
v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "101:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 89b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "bge 110f\n" + "tbz x10, #3, 105f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 103f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 102f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 109f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 104f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 109f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 107f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 106f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 109f\n" + "str s12, [x28, 
#0x0]\n" + "str s9, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 108f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "111:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 76b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "113:" // Height 4: Column loop + "tbz %x[flags], #0, 124f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 122f\n" + "tbz x10, #3, 117f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 115f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 114f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 121f\n" + "114:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 121f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 121f\n" + "115:" // Height 4: Partial accumulate: partial_2_8 + "tbz x10, #1, 116f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 121f\n" + "116:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 121f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 121f\n" + "117:" // Height 4: Partial accumulate: partial_4_0 + "tbz x10, #2, 119f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 118f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 
121f\n" + "118:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 121f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 121f\n" + "119:" // Height 4: Partial accumulate: partial_2_0 + "tbz x10, #1, 120f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 121f\n" + "120:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "121:" // Height 4: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 123f\n" + "122:" // Height 4: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "123:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 125f\n" + "124:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "125:" // Height 4: setup done + "mov x27, #0x0\n" + "126:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 127f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 128f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 128f\n" + "127:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "128:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 131f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 130f\n" + "129:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, 
#0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "cmp x26, #0x20\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "bge 129b\n" + "130:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 
// smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "131:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 138f\n" + "cmp x26, #0x8\n" + "blt 133f\n" + "132:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 132b\n" + "cbz x26, 138f\n" + "133:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #2, 135f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x26, #1, 134f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "tbz x26, #0, 137f\n" + "ld1 
{ v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "b 137f\n" + "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 137f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "b 137f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 136f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x26, #0, 137f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 137f\n" + "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "137:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "138:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 126b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "bge 147f\n" + "tbz x10, #3, 142f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "tbz x10, #2, 140f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, 
[x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "tbz x10, #1, 139f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 146f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 141f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 146f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 144f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "tbz x10, #1, 143f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "b 146f\n" + "143:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 146f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 145f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "b 146f\n" + "145:" // Height 4: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "148:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 113b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "150:" // Height 5: Column loop + "tbz %x[flags], #0, 161f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 159f\n" + "tbz x10, #3, 154f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { 
v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #2, 152f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "tbz x10, #1, 151f\n" + "ldr d16, [x28], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d6, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v6.s }[2], [x20]\n" + "b 158f\n" + "151:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 158f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s6, [x20, #0x0]\n" + "b 158f\n" + "152:" // Height 5: Partial accumulate: partial_2_8 + "tbz x10, #1, 153f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 158f\n" + "153:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 158f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 158f\n" + "154:" // Height 5: Partial accumulate: partial_4_0 + "tbz x10, #2, 156f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #1, 155f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 158f\n" + "155:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 158f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 158f\n" + "156:" // Height 5: Partial accumulate: partial_2_0 + "tbz x10, #1, 157f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 158f\n" + "157:" // Height 5: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "158:" // Height 5: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 160f\n" + "159:" // Height 5: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + 
"ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q25, [x20, #0x0]\n" + "ldr q26, [x20, #0x10]\n" + "ldr q27, [x20, #0x20]\n" + "ldr q6, [x20, #0x30]\n" + "160:" // Height 5: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 162f\n" + "161:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "162:" // Height 5: setup done + "mov x27, #0x0\n" + "163:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 164f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 165f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 165f\n" + "164:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "165:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 168f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 167f\n" + "166:" // Height 5: Multiply loop: Main loop head + "movi v6.4s, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x21, x21, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, 
v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "bge 166b\n" + "167:" // Height 5: Multiply loop: Single iteration only + "movi v6.4s, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 
v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "168:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 175f\n" + "cmp x26, #0x8\n" + "blt 170f\n" + "169:" // Height 5: Multiply loop: Odd block loop + "movi v7.4s, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, 
[x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr q6, [x9, #0x0]\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 169b\n" + "cbz x26, 175f\n" + "170:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #2, 172f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "tbz x26, #1, 171f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "b 174f\n" + "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "b 174f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 173f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "b 174f\n" + "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "174:" // Height 5: Multiply loop: Ragged operand read: Done + "movi v6.4s, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla 
v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "175:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 163b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "bge 184f\n" + "tbz x10, #3, 179f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 177f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 176f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + 
"b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 183f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 178f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 183f\n" + "178:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 183f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 181f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 180f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 183f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 182f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "185:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 150b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "187:" // Height 6: Column loop + "tbz %x[flags], #0, 198f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, 
LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 196f\n" + "tbz x10, #3, 191f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 189f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 188f\n" + "ldr d16, [x28], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d6, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v6.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 195f\n" + "188:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 195f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s6, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 195f\n" + "189:" // Height 6: Partial accumulate: partial_2_8 + "tbz x10, #1, 190f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 195f\n" + "190:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 195f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 195f\n" + "191:" // Height 6: Partial accumulate: partial_4_0 + "tbz x10, #2, 193f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 192f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 195f\n" + "192:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 195f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 195f\n" + "193:" // Height 6: Partial accumulate: partial_2_0 + "tbz x10, #1, 194f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { 
v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 195f\n" + "194:" // Height 6: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "195:" // Height 6: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 197f\n" + "196:" // Height 6: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q25, [x20, #0x0]\n" + "ldr q26, [x20, #0x10]\n" + "ldr q27, [x20, #0x20]\n" + "ldr q6, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "197:" // Height 6: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 199f\n" + "198:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "199:" // Height 6: setup done + "mov x27, #0x0\n" + "200:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 201f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 202f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 202f\n" + "201:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, 
x19\n" + "add x20, x21, x19\n" + "202:" // Height 6: input setup done + "cmp x26, #0x10\n" + "blt 205f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 204f\n" + "203:" // Height 6: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, 
v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "bge 203b\n" + "204:" // Height 6: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 
0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "205:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 212f\n" + "cmp x26, #0x8\n" + "blt 207f\n" + "206:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 206b\n" + "cbz x26, 212f\n" + "207:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #2, 209f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "ldr s6, [x20], #0x4\n" + "tbz x26, #1, 208f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "ld1 { v6.h }[2], [x20], #0x2\n" + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "ld1 { v6.b }[6], [x20]\n" + "b 211f\n" + "208:" // Height 6: Multiply 
loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "ld1 { v6.b }[4], [x20]\n" + "b 211f\n" + "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 210f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "ldr h6, [x20], #0x2\n" + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "ld1 { v6.b }[2], [x20]\n" + "b 211f\n" + "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "ldr b6, [x20, #0x0]\n" + "211:" // Height 6: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "212:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 200b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x19, #0x0]\n" + 
"uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "bge 221f\n" + "tbz x10, #3, 216f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "st1 { v24.4s }, [x19], #0x10\n" + "st1 { v25.4s }, [x19], #0x10\n" + "tbz x10, #2, 214f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x20], #0x10\n" + "st1 { v26.4s }, [x19], #0x10\n" + "tbz x10, #1, 213f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "str d27, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "st1 { v30.s }[2], [x20]\n" + "st1 { v27.s }[2], [x19]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 220f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "str s30, [x20, #0x0]\n" + "str s27, [x19, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 215f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "str d29, [x20], #0x8\n" + "str d26, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "st1 { v29.s }[2], [x20]\n" + "st1 { v26.s }[2], [x19]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 220f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "str s29, [x20, #0x0]\n" + "str s26, [x19, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 218f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x20], #0x10\n" + "st1 { v24.4s }, [x19], #0x10\n" + "tbz x10, #1, 217f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "str d28, [x20], #0x8\n" + "str d25, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "st1 { v28.s }[2], [x20]\n" + "st1 { v25.s }[2], [x19]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 220f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "str s28, [x20, #0x0]\n" + "str s25, [x19, #0x0]\n" + "b 220f\n" + "218:" // Height 
6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 219f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "str d24, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "st1 { v23.s }[2], [x20]\n" + "st1 { v24.s }[2], [x19]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "str s23, [x20, #0x0]\n" + "str s24, [x19, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "str q23, [x20, #0x0]\n" + "str q28, [x20, #0x10]\n" + "str q29, [x20, #0x20]\n" + "str q30, [x20, #0x30]\n" + "str q24, [x19, #0x0]\n" + "str q25, [x19, #0x10]\n" + "str q26, [x19, #0x20]\n" + "str q27, [x19, #0x30]\n" + "222:" // Height 6: Writeback done + "subs x10, x10, #0x10\n" + "bgt 187b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 224f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 223f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp index 5d9d84815a..ebc43425b8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. 
*/ #pragma once -#ifdef __aarch64__ +#ifdef __aarch64__ #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -44,7 +44,8 @@ void a64_hybrid_u8qa_dot_4x16_a55( ARGLIST ); class cls_a64_hybrid_u8qa_dot_4x16 { public: - typedef uint8_t operand_type; + typedef uint8_t lhs_operand_type; + typedef uint8_t rhs_operand_type; typedef uint8_t result_type; typedef void (*kern_type)( ARGLIST ); @@ -70,16 +71,24 @@ public: return false; } - StdTransformsFixed transforms = {}; - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + StdTransformsFixed transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 7.5301 }; - default: - return { 27.5482 }; + if (std::is_same<T, uint8_t>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 7.5301 }; + default: + return { 27.5482 }; + case CPUModel::A510: + return { 14.81 }; + case CPUModel::V1: + return { 48.36 }; + } } + + return { 1.0 }; } // Default to the generic kernel @@ -99,4 +108,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp index 954e2891fb..c410374357 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp @@ -406,10 +406,10 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "b 122f\n" "31:" // Height 2 "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - "movi v15.16b, #0x1\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "movi v12.4s, #0x0\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "movi v15.16b, #0x1\n" "mov x9, %x[col_bias]\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov x28, %x[output_ptr]\n" @@ -853,12 +853,12 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "b 122f\n" "61:" // Height 3 "movi v11.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "movi v12.4s, #0x0\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v13.4s, #0x0\n" - "movi v15.16b, #0x1\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "mov x9, %x[col_bias]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov x28, %x[output_ptr]\n" "62:" // Height 3: Column loop @@ -1426,14 +1426,14 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "b 122f\n" "91:" // Height 4 "movi v11.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "movi v12.4s, #0x0\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v13.4s, #0x0\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "movi v14.4s, #0x0\n" - "movi v15.16b, #0x1\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "mov x9, %x[col_bias]\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov x28, %x[output_ptr]\n" "mov x19, #0x4\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp index 6e85eec204..4fc680c45b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp @@ -283,16 +283,16 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v19.4s, v19.4s,
v4.4s\n" "tbz %x[flags], #5, 20f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v16.4s, v16.4s, v4.4s\n" - "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" @@ -612,8 +612,8 @@ void a64_hybrid_u8qa_dot_4x16 ( "ld1r { v2.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "addp v12.4s, v12.4s, v12.4s\n" "neg v2.4s, v2.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" "mul v11.4s, v11.4s, v2.4s\n" "mul v12.4s, v12.4s, v2.4s\n" "49:" // Height 2: skip row sum fixup @@ -653,27 +653,27 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v23.4s, v23.4s, v4.4s\n" "tbz %x[flags], #5, 50f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" "50:" // Height 2: no shift correction @@ -690,8 +690,6 @@ void a64_hybrid_u8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -710,16 +708,18 @@ void a64_hybrid_u8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v20.16b, v20.16b, v21.16b\n" "bge 59f\n" @@ -1094,9 +1094,9 @@ void a64_hybrid_u8qa_dot_4x16 ( "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v11.4s, v11.4s, v11.4s\n" + "neg v3.4s, v3.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" "mul v11.4s, v11.4s, v3.4s\n" "mul v12.4s, v12.4s, v3.4s\n" "mul v13.4s, v13.4s, v3.4s\n" @@ -1149,39 +1149,39 @@ void 
a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v27.4s, v27.4s, v4.4s\n" "tbz %x[flags], #5, 80f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" "and v7.16b, v26.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "and v8.16b, v27.16b, v0.16b\n" - "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" "sqadd v26.4s, v26.4s, v7.4s\n" "sqadd v27.4s, v27.4s, v8.4s\n" "80:" // Height 3: no shift correction @@ -1198,8 +1198,6 @@ void a64_hybrid_u8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -1218,31 +1216,33 @@ void a64_hybrid_u8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "add v24.4s, v24.4s, v4.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" "add v25.4s, v25.4s, v4.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" "uzp1 v17.8h, v18.8h, v19.8h\n" + "smin v27.4s, v27.4s, v6.4s\n" "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v20.8h, v20.8h, v21.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" "uzp1 v25.8h, v26.8h, v27.8h\n" @@ -1705,10 +1705,10 @@ void 
a64_hybrid_u8qa_dot_4x16 ( "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" "addp v11.4s, v11.4s, v11.4s\n" + "neg v4.4s, v4.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v4.4s, v4.4s\n" "mul v11.4s, v11.4s, v4.4s\n" "mul v12.4s, v12.4s, v4.4s\n" "mul v13.4s, v13.4s, v4.4s\n" @@ -1774,52 +1774,52 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v31.4s, v31.4s, v4.4s\n" "tbz %x[flags], #5, 110f\n" "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" "and v6.16b, v18.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" "and v7.16b, v19.16b, v0.16b\n" "and v8.16b, v20.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "and v9.16b, v21.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v10.16b, v22.16b, v0.16b\n" "sshr v8.4s, v8.4s, #0x1f\n" - "and v4.16b, v23.16b, v0.16b\n" "sshr v9.4s, v9.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v19.4s, v19.4s, v7.4s\n" "sqadd v20.4s, v20.4s, v8.4s\n" "sqadd v21.4s, v21.4s, v9.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" "and v7.16b, v26.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v8.4s, v8.4s, #0x1f\n" "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "and v9.16b, v28.16b, v0.16b\n" "and v10.16b, v29.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" "and v4.16b, v30.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" "sshr v10.4s, v10.4s, #0x1f\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v28.4s, v28.4s, v9.4s\n" "sqadd v29.4s, v29.4s, v10.4s\n" "sqadd v30.4s, v30.4s, v4.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sqadd v31.4s, v31.4s, v5.4s\n" "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" @@ -1835,8 +1835,6 @@ void a64_hybrid_u8qa_dot_4x16 ( "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "srshl v22.4s, v22.4s, v0.4s\n" - "srshl v23.4s, v23.4s, v0.4s\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -1855,45 +1853,47 @@ void a64_hybrid_u8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "smax v20.4s, v20.4s, v5.4s\n" "smax v21.4s, v21.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" "add v22.4s, v22.4s, v4.4s\n" "add v23.4s, v23.4s, v4.4s\n" - "srshl v24.4s, v24.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" "smin v22.4s, v22.4s, v6.4s\n" "smin v23.4s, v23.4s, v6.4s\n" - "srshl v25.4s, v25.4s, v0.4s\n" + "smin v24.4s, 
v24.4s, v6.4s\n" "smax v22.4s, v22.4s, v5.4s\n" "smax v23.4s, v23.4s, v5.4s\n" - "add v24.4s, v24.4s, v4.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" "add v25.4s, v25.4s, v4.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" "add v27.4s, v27.4s, v4.4s\n" - "srshl v28.4s, v28.4s, v0.4s\n" "smin v26.4s, v26.4s, v6.4s\n" + "add v28.4s, v28.4s, v4.4s\n" "smin v27.4s, v27.4s, v6.4s\n" - "srshl v29.4s, v29.4s, v0.4s\n" "smax v26.4s, v26.4s, v5.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" "smax v27.4s, v27.4s, v5.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" "add v30.4s, v30.4s, v4.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" "add v31.4s, v31.4s, v4.4s\n" - "uzp1 v16.8h, v16.8h, v17.8h\n" "smin v30.4s, v30.4s, v6.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" "smin v31.4s, v31.4s, v6.4s\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" "smax v30.4s, v30.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v20.8h, v20.8h, v21.8h\n" "uzp1 v21.8h, v22.8h, v23.8h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp new file mode 100644 index 0000000000..8a47701a4a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<uint8_t>, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg<uint8_t>, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hybrid_u8qa_mmla_4x16( ARGLIST ); + +class cls_a64_hybrid_u8qa_mmla_4x16 +{ +public: + typedef uint8_t lhs_operand_type; + typedef uint8_t rhs_operand_type; + typedef uint8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + if (std::is_same<T, uint8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 47.68 }; + case CPUModel::A510: + return { 28.00 }; + case CPUModel::V1: + return { 68.98 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_u8qa_mmla_4x16; + cls_a64_hybrid_u8qa_mmla_4x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp new file mode 100644 index 0000000000..daeb986529 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp @@ -0,0 +1,2104 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_u8qa_mmla_4x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 97f\n" + "cmp %x[M], #0x2\n" + "bgt 65f\n" + "beq 33f\n" + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "movi v15.16b, #0x1\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "blt 11f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q5, [x28, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q6, [x28, #0x10]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s,
v0.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "ldr q1, [x23, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q5, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "sub x24, x24, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x23, x23, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 10f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 20f\n" + "cmp x24, #0x8\n" + "blt 14f\n" + "12:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x30]\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + 
"ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + "bge 12b\n" + "cbz x24, 20f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x24, #2, 16f\n" + "ldr s1, [x23], #0x4\n" + "tbz x24, #1, 15f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[6], [x23]\n" + "b 18f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[4], [x23]\n" + "b 18f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 17f\n" + "ldr h1, [x23], #0x2\n" + "tbz x24, #0, 18f\n" + "ld1 { v1.b }[2], [x23]\n" + "b 18f\n" + "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "18:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 19f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "19:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x40]\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x60]\n" + "ldr q10, [x28, #0x70]\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + "20:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v16.16b\n" + "tbnz %x[flags], #31, 21f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v1.4s }, [x22]\n" + "dup v11.4s, v11.s[0]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "21:" // Height 1: skip row sum fixup + "add v23.4s, v23.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v23.4s, v23.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 22f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, 
v6.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "22:" // Height 1: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v23.8h, v23.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v17.16b\n" + "bge 31f\n" + "tbz x9, #3, 26f\n" + "str d23, [x26], #0x8\n" + "tbz x9, #2, 24f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "tbz x9, #1, 23f\n" + "st1 { v23.h }[6], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[14], [x26]\n" + "b 30f\n" + "23:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[12], [x26]\n" + "b 30f\n" + "24:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 25f\n" + "st1 { v23.h }[4], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[10], [x26]\n" + "b 30f\n" + "25:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[8], [x26]\n" + "b 30f\n" + "26:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 28f\n" + "str s23, [x26], #0x4\n" + "tbz x9, #1, 27f\n" + "st1 { v23.h }[2], [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[6], [x26]\n" + "b 30f\n" + "27:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 30f\n" + "st1 { v23.b }[4], [x26]\n" + "b 30f\n" + "28:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 29f\n" + "str h23, [x26], #0x2\n" + "tbz x9, #0, 30f\n" + "st1 { v23.b }[2], [x26]\n" + "b 30f\n" + "29:" // Height 1: Partial direct writeback: partial_1_0 + "str b23, [x26, #0x0]\n" + "30:" // Height 1: Partial direct writeback: Done + "b 32f\n" + "31:" // Height 1: Full writeback + "str q23, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "32:" // Height 1: Writeback done + "subs x9, x9, #0x10\n" + "bgt 2b\n" + "b 130f\n" + "33:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v15.16b, #0x1\n" + "mov x26, %x[output_ptr]\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x25, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 38f\n" + "37:" // Height 2: setup 
direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "38:" // Height 2: input setup done + "cmp x24, #0x10\n" + "blt 43f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q5, [x28, #0x70]\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + "ldr q7, [x28, #0x90]\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q10, [x28, #0xc0]\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q8, [x28, #0x30]\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q5, [x28, #0x70]\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + "ldr q8, [x28, #0xa0]\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q10, [x28, #0xc0]\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 
0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 52f\n" + "cmp x24, #0x8\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x30]\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + "bge 44b\n" + "cbz x24, 52f\n" + "46:" // Height 2: Multiply loop: Skip odd blocks + "tbz x24, #2, 48f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbz x24, #1, 47f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "b 50f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "b 50f\n" + "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 49f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x24, #0, 50f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 50f\n" + "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "50:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 51f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "51:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x40]\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x60]\n" + "ldr q10, [x28, #0x70]\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + "52:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 36b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, 
v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "mov v23.16b, v4.16b\n" + "tbnz %x[flags], #31, 53f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "53:" // Height 2: skip row sum fixup + "add v23.4s, v23.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 54f\n" + "and v4.16b, v23.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "54:" // Height 2: no shift correction + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax 
v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "uzp1 v23.8h, v23.8h, v20.8h\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 63f\n" + "tbz x9, #3, 58f\n" + "str d23, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "tbz x9, #2, 56f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "tbz x9, #1, 55f\n" + "st1 { v23.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "b 62f\n" + "55:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "b 62f\n" + "56:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 57f\n" + "st1 { v23.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "b 62f\n" + "57:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "b 62f\n" + "58:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 60f\n" + "str s23, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "tbz x9, #1, 59f\n" + "st1 { v23.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "b 62f\n" + "59:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 62f\n" + "st1 { v23.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "b 62f\n" + "60:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 61f\n" + "str h23, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "tbz x9, #0, 62f\n" + "st1 { v23.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "b 62f\n" + "61:" // Height 2: Partial direct writeback: partial_1_0 + "str b23, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "62:" // Height 2: Partial direct writeback: Done + "b 64f\n" + "63:" // Height 2: Full writeback + "str q23, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "64:" // Height 2: Writeback done + "subs x9, x9, #0x10\n" + "bgt 34b\n" + "b 130f\n" + "65:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "mov x26, %x[output_ptr]\n" + "movi v15.16b, #0x1\n" + "66:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "67:" // Height 3: setup done + "mov x25, #0x0\n" + "68:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz 
%x[flags], #3, 69f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 70f\n" + "69:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "70:" // Height 3: input setup done + "cmp x24, #0x10\n" + "blt 75f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 73f\n" + "71:" // Height 3: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x20]\n" + "add x21, x21, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 72f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" + "72:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" 
+ "prfm pldl1keep, [x21, #0x80]\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 71b\n" + "73:" // Height 3: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q5, [x28, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x28, #0x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + "add x21, x21, #0x10\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + "ldr q9, [x28, #0x40]\n" + "ldr q10, [x28, #0x50]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 74f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" + "74:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "75:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 84f\n" + "cmp x24, #0x8\n" + "blt 78f\n" + "76:" // Height 3: Multiply loop: Odd block loop + "movi v7.16b, #0x0\n" + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x21], #0x8\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 77f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "77:" // Height 3: Multiply loop: 
unique 11: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + "bge 76b\n" + "cbz x24, 84f\n" + "78:" // Height 3: Multiply loop: Skip odd blocks + "tbz x24, #2, 80f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x21], #0x4\n" + "tbz x24, #1, 79f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "ld1 { v3.h }[2], [x21], #0x2\n" + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "ld1 { v3.b }[6], [x21]\n" + "b 82f\n" + "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "ld1 { v3.b }[4], [x21]\n" + "b 82f\n" + "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 81f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x21], #0x2\n" + "tbz x24, #0, 82f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x21]\n" + "b 82f\n" + "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x21, #0x0]\n" + "82:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v9.16b, #0x0\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 83f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "83:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + "ldr q9, [x28, #0x60]\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" + "ldr q10, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, 
v0.16b, v9.16b\n" + ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + "84:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 68b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "add x20, x21, x19\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 85f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x22]\n" + "addp v13.4s, v13.4s, v13.4s\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v3.4s, v3.4s\n" + "dup v13.4s, v13.s[0]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "85:" // Height 3: skip row sum fixup + "add v31.4s, v31.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 86f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, 
v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "86:" // Height 3: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "uzp1 v31.8h, v31.8h, v20.8h\n" + "add v26.4s, v26.4s, v4.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 95f\n" + "tbz x9, #3, 90f\n" + "str d31, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x9, #2, 88f\n" + "st1 { v31.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x9, #1, 87f\n" + "st1 { v31.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 94f\n" + "87:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 94f\n" + "88:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 89f\n" + "st1 { v31.h }[4], [x26], 
#0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 94f\n" + "89:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 94f\n" + "90:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x9, #2, 92f\n" + "str s31, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x9, #1, 91f\n" + "st1 { v31.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 94f\n" + "91:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 94f\n" + "st1 { v31.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 94f\n" + "92:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 93f\n" + "str h31, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x9, #0, 94f\n" + "st1 { v31.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 94f\n" + "93:" // Height 3: Partial direct writeback: partial_1_0 + "str b31, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "94:" // Height 3: Partial direct writeback: Done + "b 96f\n" + "95:" // Height 3: Full writeback + "str q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "96:" // Height 3: Writeback done + "subs x9, x9, #0x10\n" + "bgt 66b\n" + "b 130f\n" + "97:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" + "movi v14.4s, #0x0\n" + "mov x19, #0x4\n" + "movi v15.16b, #0x1\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "98:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "99:" // Height 4: setup done + "mov x25, #0x0\n" + "100:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 101f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 102f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 102f\n" + "101:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "102:" // Height 4: input setup done + "cmp x24, #0x10\n" + "blt 107f\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "blt 105f\n" + "103:" // 
Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" + "ldr q7, [x28, #0x20]\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q1, [x23, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "bge 103b\n" + "105:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x21, #0x0]\n" + "sub x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x28, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x28, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x6e85a410 // ummla v16.4s, v0.16b, 
v5.16b\n" + "ldr q7, [x28, #0x20]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" + "ldr q8, [x28, #0x30]\n" + ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" + "ldr q9, [x28, #0x40]\n" + ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" + "ldr q10, [x28, #0x50]\n" + "ldr q4, [x28, #0x60]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "ldr q5, [x28, #0x70]\n" + ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" + "ldr q6, [x28, #0x80]\n" + ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" + "ldr q7, [x28, #0x90]\n" + ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" + "ldr q8, [x28, #0xa0]\n" + ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" + "ldr q9, [x28, #0xb0]\n" + ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" + "ldr q10, [x28, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + "ldr q5, [x28, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 106f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" + "106:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "107:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 116f\n" + "cmp x24, #0x8\n" + "blt 110f\n" + "108:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x23], #0x8\n" + "ldr d2, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 109f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "109:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x28, #0x0]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q9, [x28, #0x10]\n" + "sub x24, x24, #0x8\n" + ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" + "ldr q10, [x28, #0x20]\n" + "cmp x24, #0x8\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e8aa411 
// ummla v17.4s, v0.16b, v10.16b\n" + "ldr q6, [x28, #0x50]\n" + ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" + "ldr q7, [x28, #0x60]\n" + "ldr q8, [x28, #0x70]\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "add x28, x28, #0x80\n" + ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + "bge 108b\n" + "cbz x24, 116f\n" + "110:" // Height 4: Multiply loop: Skip odd blocks + "tbz x24, #2, 112f\n" + "ldr s1, [x23], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x21], #0x4\n" + "ldr s9, [x20], #0x4\n" + "tbz x24, #1, 111f\n" + "ld1 { v1.h }[2], [x23], #0x2\n" + "ld1 { v2.h }[2], [x22], #0x2\n" + "ld1 { v3.h }[2], [x21], #0x2\n" + "ld1 { v9.h }[2], [x20], #0x2\n" + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[6], [x23]\n" + "ld1 { v2.b }[6], [x22]\n" + "ld1 { v3.b }[6], [x21]\n" + "ld1 { v9.b }[6], [x20]\n" + "b 114f\n" + "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[4], [x23]\n" + "ld1 { v2.b }[4], [x22]\n" + "ld1 { v3.b }[4], [x21]\n" + "ld1 { v9.b }[4], [x20]\n" + "b 114f\n" + "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x24, #1, 113f\n" + "ldr h1, [x23], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x21], #0x2\n" + "ldr h9, [x20], #0x2\n" + "tbz x24, #0, 114f\n" + "ld1 { v1.b }[2], [x23]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x21]\n" + "ld1 { v9.b }[2], [x20]\n" + "b 114f\n" + "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x23, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x21, #0x0]\n" + "ldr b9, [x20, #0x0]\n" + "114:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 115f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "115:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + "ldr q7, [x28, #0x40]\n" + ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" + "ldr q8, [x28, #0x50]\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + "ldr q9, [x28, #0x60]\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" + "ldr q10, [x28, #0x70]\n" + "add x28, x28, #0x80\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + "116:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, 
[%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 100b\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "add x20, x21, x19\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x19, #0x0]\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 117f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "addp v13.4s, v13.4s, v13.4s\n" + "dup v12.4s, v11.s[3]\n" + "dup v11.4s, v11.s[0]\n" + "neg v4.4s, v4.4s\n" + "dup v14.4s, v13.s[3]\n" + "dup v13.4s, v13.s[0]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "117:" // Height 4: skip row sum fixup + "add v31.4s, v31.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v20.4s, v20.4s, v11.4s\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add v21.4s, v21.4s, v11.4s\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v22.4s, v22.4s, v11.4s\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" + "add v16.4s, v16.4s, v12.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v17.4s, v17.4s, v12.4s\n" + "add v18.4s, v18.4s, v12.4s\n" + "add v19.4s, v19.4s, v12.4s\n" + "add v23.4s, v23.4s, v13.4s\n" + "add v28.4s, v28.4s, v13.4s\n" + "add v29.4s, v29.4s, v13.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "add v24.4s, v24.4s, v14.4s\n" + "add v25.4s, v25.4s, v14.4s\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v27.4s, v27.4s, v14.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 118f\n" + "and v4.16b, v31.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr 
v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v4.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "and v7.16b, v22.16b, v0.16b\n" + "and v8.16b, v16.16b, v0.16b\n" + "and v9.16b, v17.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v7.4s\n" + "sqadd v16.4s, v16.4s, v8.4s\n" + "sqadd v17.4s, v17.4s, v9.4s\n" + "and v10.16b, v18.16b, v0.16b\n" + "and v4.16b, v19.16b, v0.16b\n" + "and v5.16b, v23.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sqadd v23.4s, v23.4s, v5.4s\n" + "and v6.16b, v28.16b, v0.16b\n" + "and v7.16b, v29.16b, v0.16b\n" + "and v8.16b, v30.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v6.4s\n" + "sqadd v29.4s, v29.4s, v7.4s\n" + "sqadd v30.4s, v30.4s, v8.4s\n" + "and v9.16b, v24.16b, v0.16b\n" + "and v10.16b, v25.16b, v0.16b\n" + "and v4.16b, v26.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v10.4s\n" + "sqadd v26.4s, v26.4s, v4.4s\n" + "and v5.16b, v27.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "118:" // Height 4: no shift correction + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "add x22, %x[qp], %[minval]\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "uzp1 v31.8h, v31.8h, v20.8h\n" + 
"add v26.4s, v26.4s, v4.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "uzp1 v20.8h, v21.8h, v22.8h\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.8h, v23.8h, v28.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v20.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 127f\n" + "tbz x9, #3, 122f\n" + "str d31, [x26], #0x8\n" + "str d16, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "str d24, [x19], #0x8\n" + "tbz x9, #2, 120f\n" + "st1 { v31.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x21], #0x4\n" + "st1 { v23.s }[2], [x20], #0x4\n" + "st1 { v24.s }[2], [x19], #0x4\n" + "tbz x9, #1, 119f\n" + "st1 { v31.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x21], #0x2\n" + "st1 { v23.h }[6], [x20], #0x2\n" + "st1 { v24.h }[6], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[14], [x26]\n" + "st1 { v16.b }[14], [x21]\n" + "st1 { v23.b }[14], [x20]\n" + "st1 { v24.b }[14], [x19]\n" + "b 126f\n" + "119:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[12], [x26]\n" + "st1 { v16.b }[12], [x21]\n" + "st1 { v23.b }[12], [x20]\n" + "st1 { v24.b }[12], [x19]\n" + "b 126f\n" + "120:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 121f\n" + "st1 { v31.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x21], #0x2\n" + "st1 { v23.h }[4], [x20], #0x2\n" + "st1 { v24.h }[4], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[10], [x26]\n" + "st1 { v16.b }[10], [x21]\n" + "st1 { v23.b }[10], [x20]\n" + "st1 { v24.b }[10], [x19]\n" + "b 126f\n" + "121:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[8], [x26]\n" + "st1 { v16.b }[8], [x21]\n" + "st1 { v23.b }[8], [x20]\n" + "st1 { v24.b }[8], [x19]\n" + "b 126f\n" + "122:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 124f\n" + "str s31, [x26], #0x4\n" + "str s16, [x21], #0x4\n" + "str s23, [x20], #0x4\n" + "str s24, [x19], #0x4\n" + "tbz x9, #1, 123f\n" + "st1 { v31.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x21], #0x2\n" + "st1 { v23.h }[2], [x20], #0x2\n" + "st1 { v24.h }[2], [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[6], [x26]\n" + "st1 { v16.b }[6], [x21]\n" + "st1 { v23.b }[6], [x20]\n" + "st1 { v24.b }[6], [x19]\n" + "b 126f\n" + "123:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 126f\n" + "st1 { v31.b }[4], [x26]\n" + "st1 { v16.b }[4], [x21]\n" + "st1 { v23.b }[4], [x20]\n" + "st1 { v24.b }[4], [x19]\n" + "b 126f\n" + "124:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x9, #1, 125f\n" + "str h31, [x26], #0x2\n" + "str h16, [x21], #0x2\n" + "str h23, [x20], #0x2\n" + "str h24, [x19], #0x2\n" + "tbz x9, #0, 126f\n" + "st1 { v31.b }[2], [x26]\n" + "st1 { v16.b }[2], [x21]\n" + "st1 { v23.b }[2], [x20]\n" + "st1 { v24.b }[2], [x19]\n" + "b 126f\n" + "125:" // Height 4: Partial direct writeback: partial_1_0 + "str b31, [x26, #0x0]\n" + "str b16, [x21, #0x0]\n" + "str b23, [x20, #0x0]\n" + "str b24, [x19, #0x0]\n" + "126:" // Height 4: Partial direct writeback: Done + "b 128f\n" + "127:" // Height 4: Full writeback + "str q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q16, [x21, #0x0]\n" + "str q23, 
[x20, #0x0]\n" + "str q24, [x19, #0x0]\n" + "128:" // Height 4: Writeback done + "subs x9, x9, #0x10\n" + "bgt 98b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 130f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 129f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "129:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "130:" // Exit + + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp index da07fc17a1..3a77397632 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. 
 */
 #pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_u8u32_dot_6x16_a55( ARGLIST );
 class cls_a64_hybrid_u8u32_dot_6x16
 {
 public:
-    typedef uint8_t operand_type;
+    typedef uint8_t lhs_operand_type;
+    typedef uint8_t rhs_operand_type;
     typedef uint32_t result_type;
 
     typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,35 @@ public:
         return true;
     }
 
-    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
-    static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-        switch (ci->get_cpu_model()) {
-            case CPUModel::A55r1:
-                return { 12.667, 2.0799, 0.2279 };
-            default:
-                return { 29.6736, 11.4025, 0.5591 };
+        if (std::is_same<T, uint32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                case CPUModel::A55r1:
+                    return { 9.5238, 2.0799, 0.2279 };
+                default:
+                    return { 29.6736, 11.4025, 0.5591 };
+                case CPUModel::A510:
+                    return { 16.65, 3.92, 0.48 };
+                case CPUModel::V1:
+                    return { 55.42, 19.29, 0.92 };
+            }
         }
+
+        if (std::is_same<T, uint8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 31.63 };
+                case CPUModel::A510:
+                    return { 15.89 };
+                case CPUModel::V1:
+                    return { 53.87 };
+            }
+        }
+
+        return { 1.0 };
     }
 
     // Default to the generic kernel
@@ -99,4 +119,5 @@ public:
 } // namespace arm_gemm
 
 #undef ARGLIST
+
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index ba57ad493a..ab0c88a3b2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -1819,8 +1819,8 @@ void a64_hybrid_u8u32_dot_6x16 (
       "ld1 { v22.4s }, [x21], #0x10\n"
       "ld1 { v26.4s }, [x20], #0x10\n"
       "tbz x10, #1, 139f\n"
-      "mov x24, #0x38\n"
       "ldr d11, [x28], #0x8\n"
+      "mov x24, #0x38\n"
       "ldr d15, [x23], #0x8\n"
       "ldr d19, [x22], #0x8\n"
       "ldr d23, [x21], #0x8\n"
@@ -1873,8 +1873,8 @@ void a64_hybrid_u8u32_dot_6x16 (
       "ld1 { v20.4s }, [x21], #0x10\n"
       "ld1 { v24.4s }, [x20], #0x10\n"
       "tbz x10, #1, 143f\n"
-      "mov x24, #0x18\n"
       "ldr d9, [x28], #0x8\n"
+      "mov x24, #0x18\n"
       "ldr d13, [x23], #0x8\n"
       "ldr d17, [x22], #0x8\n"
       "ldr d21, [x21], #0x8\n"
@@ -2487,12 +2487,12 @@ void a64_hybrid_u8u32_dot_6x16 (
       "ld1 { v16.4s }, [x22], #0x10\n"
       "ld1 { v20.4s }, [x21], #0x10\n"
       "ld1 { v24.4s }, [x20], #0x10\n"
-      "ld1 { v28.4s }, [x19], #0x10\n"
       "ld1 { v9.4s }, [x28], #0x10\n"
       "ld1 { v13.4s }, [x23], #0x10\n"
       "ld1 { v17.4s }, [x22], #0x10\n"
       "ld1 { v21.4s }, [x21], #0x10\n"
       "ld1 { v25.4s }, [x20], #0x10\n"
+      "ld1 { v28.4s }, [x19], #0x10\n"
       "ld1 { v29.4s }, [x19], #0x10\n"
       "tbz x10, #2, 174f\n"
       "ld1 { v10.4s }, [x28], #0x10\n"
@@ -2502,8 +2502,8 @@
       "ld1 { v14.4s }, [x23], #0x10\n"
       "ld1 { v18.4s }, [x22], #0x10\n"
       "ld1 { v22.4s }, [x21], #0x10\n"
       "ld1 { v26.4s }, [x20], #0x10\n"
       "ld1 { v30.4s }, [x19], #0x10\n"
       "tbz x10, #1, 173f\n"
-      "mov x24, #0x38\n"
       "ldr d11, [x28], #0x8\n"
+      "mov x24, #0x38\n"
       "ldr d15, [x23], #0x8\n"
       "ldr d19, [x22], #0x8\n"
       "ldr d23, [x21], #0x8\n"
@@ -2563,8 +2563,8 @@ void a64_hybrid_u8u32_dot_6x16 (
       "ld1 { v24.4s }, [x20], #0x10\n"
       "ld1 { v28.4s }, [x19], #0x10\n"
       "tbz x10, #1, 177f\n"
-      "mov x24, #0x18\n"
       "ldr d9, [x28], #0x8\n"
+      "mov x24, #0x18\n"
       "ldr d13, [x23], #0x8\n"
       "ldr d17, [x22], #0x8\n"
       "ldr d21, [x21], #0x8\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
new file mode 100644
index 0000000000..24bad3c63e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<uint8_t>, \
+    size_t, size_t, \
+    const uint8_t *, \
+    IndirectOutputArg<uint32_t>, \
+    const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_u8u32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_mmla_6x16
+{
+public:
+    typedef uint8_t lhs_operand_type;
+    typedef uint8_t rhs_operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 8;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
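+    // A single UMMLA takes 2 rows x 8 K-bytes of interleaved LHS and 8 K-bytes
+    // x 2 columns of RHS and accumulates a 2x2 int32 tile; hence k_unroll() of
+    // 8 and the zip/uzp "MMLA fixup" of the accumulators in generic.cpp.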
+    StdTransformsFixed<lhs_operand_type, rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        if (std::is_same<T, uint8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 55.05 };
+                case CPUModel::A510:
+                    return { 30.34 };
+                case CPUModel::V1:
+                    return { 83.77 };
+            }
+        }
+
+        if (std::is_same<T, uint32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 55.31, 15.72, 0.62 };
+                case CPUModel::A510:
+                    return { 33.64, 3.92, 0.48 };
+                case CPUModel::V1:
+                    return { 86.71, 19.00, 0.93 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_u8u32_mmla_6x16;
+    cls_a64_hybrid_u8u32_mmla_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..fabb3f3efd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -0,0 +1,3463 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_mmla_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+    const uint32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
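+    // flags bit 0: accumulate on top of the existing output; bit 2: output is
+    // indirect (an array of row pointers); bit 3: input is indirect. The
+    // assembly tests these bits with tbz/tbnz, e.g. bit 0 selects loading the
+    // partial results versus zeroing the accumulators.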
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 186f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 149f\n"
+      "beq 112f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 75f\n"
+      "beq 38f\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x28, %x[output_ptr]\n"
+      "2:" // Height 1: Column loop
+      "tbz %x[flags], #0, 13f\n"
+      "cmp x10, #0x10\n"
+      "bge 11f\n"
+      "tbz x10, #3, 6f\n"
+      "ld1 { v9.4s }, [x28], #0x10\n"
+      "ld1 { v10.4s }, [x28], #0x10\n"
+      "tbz x10, #2, 4f\n"
+      "ld1 { v11.4s }, [x28], #0x10\n"
+      "tbz x10, #1, 3f\n"
+      "mov x24, #0x38\n"
+      "ldr d16, [x28], #0x8\n"
+      "tbz x10, #0, 10f\n"
+      "ld1 { v16.s }[2], [x28]\n"
+      "b 10f\n"
+      "3:" // Height 1: Partial accumulate: partial_1_12
+      "mov x24, #0x30\n"
+      "tbz x10, #0, 10f\n"
+      "ldr s16, [x28, #0x0]\n"
+      "b 10f\n"
+      "4:" // Height 1: Partial accumulate: partial_2_8
+      "tbz x10, #1, 5f\n"
+      "ldr d11, [x28], #0x8\n"
+      "mov x24, #0x28\n"
+      "tbz x10, #0, 10f\n"
+      "ld1 { v11.s }[2], [x28]\n"
+      "b 10f\n"
+      "5:" // Height 1: Partial accumulate: 
partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 10f\n" + "ldr s11, [x28, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x10, #2, 8f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #1, 7f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "tbz x10, #0, 10f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 10f\n" + "ldr s10, [x28, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x10, #1, 9f\n" + "ldr d9, [x28], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 12f\n" + "11:" // Height 1: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "12:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x27, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x26, #0x10\n" + "blt 20f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" 
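+ // Sixteen B vectors (x9 advances 0x100 per iteration) cover one 16-byte K block: the first eight multiply v0 (K bytes 0-7 of the row pair), the last eight multiply v1 (K bytes 8-15).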
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 27f\n" + "cmp x26, #0x8\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "movi v2.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + "bge 21b\n" + "cbz x26, 27f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #2, 24f\n" + "ldr s1, [x25], #0x4\n" + "tbz x26, #1, 23f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[6], [x25]\n" + "b 26f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[4], [x25]\n" + "b 26f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 25f\n" + "ldr h1, [x25], #0x2\n" + "tbz x26, #0, 26f\n" + "ld1 { v1.b }[2], [x25]\n" + "b 26f\n" + "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "26:" // Height 1: Multiply loop: Ragged operand read: Done + "movi v2.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "ldr q6, [x9, 
#0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "27:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 15b\n" + "uzp1 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "cmp x10, #0x10\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "bge 36f\n" + "tbz x10, #3, 31f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 29f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 28f\n" + "str d11, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v11.s }[2], [x28]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 35f\n" + "str s11, [x28, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 30f\n" + "str d10, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v10.s }[2], [x28]\n" + "b 35f\n" + "30:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 35f\n" + "str s10, [x28, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 33f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 32f\n" + "str d9, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v9.s }[2], [x28]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 35f\n" + "str s9, [x28, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 34f\n" + "str d8, [x28], #0x8\n" + "tbz x10, #0, 35f\n" + "st1 { v8.s }[2], [x28]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "37:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "39:" // Height 2: Column loop + "tbz %x[flags], #0, 50f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "bge 48f\n" + "tbz x10, #3, 43f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 41f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 40f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "b 47f\n" + "40:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 47f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, 
[x23, #0x0]\n" + "b 47f\n" + "41:" // Height 2: Partial accumulate: partial_2_8 + "tbz x10, #1, 42f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 47f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 47f\n" + "42:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 47f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 47f\n" + "43:" // Height 2: Partial accumulate: partial_4_0 + "tbz x10, #2, 45f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 44f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 47f\n" + "44:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 47f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 47f\n" + "45:" // Height 2: Partial accumulate: partial_2_0 + "tbz x10, #1, 46f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 47f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 47f\n" + "46:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "47:" // Height 2: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 49f\n" + "48:" // Height 2: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "49:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 51f\n" + "50:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "51:" // Height 2: setup done + "mov x27, #0x0\n" + "52:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 53f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 54f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 54f\n" + "53:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "54:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 57f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 56f\n" + "55:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x24, x24, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "sub x26, x26, #0x10\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "cmp x26, #0x20\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e86a40d // ummla v13.4s, 
v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + "bge 55b\n" + "56:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x25, x25, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "add x24, x24, #0x10\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "57:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 64f\n" + "cmp x26, #0x8\n" + "blt 59f\n" + "58:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x8\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, 
v0.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + "bge 58b\n" + "cbz x26, 64f\n" + "59:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #2, 61f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "tbz x26, #1, 60f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "b 63f\n" + "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "b 63f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 62f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x26, #0, 63f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 63f\n" + "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "63:" // Height 2: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "64:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 52b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "bge 73f\n" + "tbz x10, #3, 68f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "tbz x10, #2, 66f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "tbz x10, #1, 65f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 72f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 67f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 72f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 70f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "tbz x10, #1, 69f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v12.s }[2], 
[x28]\n" + "st1 { v9.s }[2], [x23]\n" + "b 72f\n" + "69:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 72f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 71f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "tbz x10, #0, 72f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "74:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 39b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "76:" // Height 3: Column loop + "tbz %x[flags], #0, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 85f\n" + "tbz x10, #3, 80f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #2, 78f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "tbz x10, #1, 77f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "b 84f\n" + "77:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 84f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "b 84f\n" + "78:" // Height 3: Partial accumulate: partial_2_8 + "tbz x10, #1, 79f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 84f\n" + "79:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 84f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 84f\n" + "80:" // Height 3: Partial accumulate: partial_4_0 + "tbz x10, #2, 82f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #1, 81f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 84f\n" + "81:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 84f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 84f\n" + "82:" // Height 3: Partial accumulate: partial_2_0 + "tbz x10, #1, 83f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x10, #0, 84f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 84f\n" + "83:" 
// Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "84:" // Height 3: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 86f\n" + "85:" // Height 3: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "86:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 88f\n" + "87:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "88:" // Height 3: setup done + "mov x27, #0x0\n" + "89:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 90f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 91f\n" + "90:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "91:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 94f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 93f\n" + "92:" // Height 3: Multiply loop: Main loop head + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, 
v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + "bge 92b\n" + "93:" // Height 3: Multiply loop: Single iteration only + "movi v4.16b, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 
// ummla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + "94:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 101f\n" + "cmp x26, #0x8\n" + "blt 96f\n" + "95:" // Height 3: Multiply loop: Odd block loop + "movi v4.16b, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + "bge 95b\n" + "cbz x26, 101f\n" + "96:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #2, 98f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "tbz x26, #1, 97f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "b 100f\n" + "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "b 100f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 99f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "tbz x26, #0, 100f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 100f\n" + "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "100:" // Height 3: Multiply loop: Ragged operand read: Done + "movi v4.16b, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q6, [x9, #0x10]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a450 // ummla v16.4s, 
v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "101:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 89b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, v19.2d, v23.2d\n" + "bge 110f\n" + "tbz x10, #3, 105f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 103f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 102f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 109f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 104f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 109f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 107f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 106f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 109f\n" + "str s12, [x28, 
#0x0]\n" + "str s9, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 108f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #0, 109f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "111:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 76b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "113:" // Height 4: Column loop + "tbz %x[flags], #0, 124f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 122f\n" + "tbz x10, #3, 117f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 115f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 114f\n" + "mov x24, #0x38\n" + "ldr d16, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 121f\n" + "114:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 121f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 121f\n" + "115:" // Height 4: Partial accumulate: partial_2_8 + "tbz x10, #1, 116f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 121f\n" + "116:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 121f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 121f\n" + "117:" // Height 4: Partial accumulate: partial_4_0 + "tbz x10, #2, 119f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 118f\n" + "mov x24, #0x18\n" + "ldr d10, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 
121f\n" + "118:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 121f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 121f\n" + "119:" // Height 4: Partial accumulate: partial_2_0 + "tbz x10, #1, 120f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "tbz x10, #0, 121f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 121f\n" + "120:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "121:" // Height 4: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 123f\n" + "122:" // Height 4: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "123:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 125f\n" + "124:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "125:" // Height 4: setup done + "mov x27, #0x0\n" + "126:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 127f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 128f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 128f\n" + "127:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "128:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 131f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 130f\n" + "129:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, 
#0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "cmp x26, #0x20\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + "bge 129b\n" + "130:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 
// ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + "131:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 138f\n" + "cmp x26, #0x8\n" + "blt 133f\n" + "132:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + "bge 132b\n" + "cbz x26, 138f\n" + "133:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #2, 135f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x26, #1, 134f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "tbz x26, #0, 137f\n" + "ld1 
{ v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "b 137f\n" + "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 137f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "b 137f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 136f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x26, #0, 137f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 137f\n" + "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "137:" // Height 4: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "138:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 126b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "bge 147f\n" + "tbz x10, #3, 142f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "tbz x10, #2, 140f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, 
[x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "tbz x10, #1, 139f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 146f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 141f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 146f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 144f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "tbz x10, #1, 143f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "b 146f\n" + "143:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 146f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 145f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "tbz x10, #0, 146f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "b 146f\n" + "145:" // Height 4: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "148:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 113b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "150:" // Height 5: Column loop + "tbz %x[flags], #0, 161f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 159f\n" + "tbz x10, #3, 154f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { 
v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #2, 152f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "tbz x10, #1, 151f\n" + "ldr d16, [x28], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d6, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v6.s }[2], [x20]\n" + "b 158f\n" + "151:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 158f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s6, [x20, #0x0]\n" + "b 158f\n" + "152:" // Height 5: Partial accumulate: partial_2_8 + "tbz x10, #1, 153f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 158f\n" + "153:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 158f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 158f\n" + "154:" // Height 5: Partial accumulate: partial_4_0 + "tbz x10, #2, 156f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #1, 155f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 158f\n" + "155:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 158f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 158f\n" + "156:" // Height 5: Partial accumulate: partial_2_0 + "tbz x10, #1, 157f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x10, #0, 158f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 158f\n" + "157:" // Height 5: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "158:" // Height 5: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 160f\n" + "159:" // Height 5: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + 
"ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q25, [x20, #0x0]\n" + "ldr q26, [x20, #0x10]\n" + "ldr q27, [x20, #0x20]\n" + "ldr q6, [x20, #0x30]\n" + "160:" // Height 5: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 162f\n" + "161:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "162:" // Height 5: setup done + "mov x27, #0x0\n" + "163:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 164f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 165f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 165f\n" + "164:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "165:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 168f\n" + "ldr q1, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "blt 167f\n" + "166:" // Height 5: Multiply loop: Main loop head + "movi v6.4s, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x21, x21, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "cmp x26, #0x20\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e87a450 // ummla v16.4s, 
v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + "bge 166b\n" + "167:" // Height 5: Multiply loop: Single iteration only + "movi v6.4s, #0x0\n" + "ldr q2, [x24, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 
v4.2d, v5.2d, v6.2d\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + "168:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 175f\n" + "cmp x26, #0x8\n" + "blt 170f\n" + "169:" // Height 5: Multiply loop: Odd block loop + "movi v7.4s, #0x0\n" + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, 
[x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr q6, [x9, #0x0]\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n" + "bge 169b\n" + "cbz x26, 175f\n" + "170:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #2, 172f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "tbz x26, #1, 171f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "b 174f\n" + "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "b 174f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 173f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "tbz x26, #0, 174f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "b 174f\n" + "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "174:" // Height 5: Multiply loop: Ragged operand read: Done + "movi v6.4s, #0x0\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a450 // ummla 
v16.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "175:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 163b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "bge 184f\n" + "tbz x10, #3, 179f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 177f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 176f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + 
"b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 183f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 178f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 183f\n" + "178:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 183f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 181f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 180f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 183f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 182f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #0, 183f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "185:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 150b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "187:" // Height 6: Column loop + "tbz %x[flags], #0, 198f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, 
LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 196f\n" + "tbz x10, #3, 191f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 189f\n" + "ld1 { v11.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v19.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v27.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 188f\n" + "ldr d16, [x28], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d6, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v16.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v6.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 195f\n" + "188:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 195f\n" + "ldr s16, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s6, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 195f\n" + "189:" // Height 6: Partial accumulate: partial_2_8 + "tbz x10, #1, 190f\n" + "ldr d11, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d19, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 195f\n" + "190:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 195f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 195f\n" + "191:" // Height 6: Partial accumulate: partial_4_0 + "tbz x10, #2, 193f\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 192f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 195f\n" + "192:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 195f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 195f\n" + "193:" // Height 6: Partial accumulate: partial_2_0 + "tbz x10, #1, 194f\n" + "ldr d9, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x10, #0, 195f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { 
v12.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 195f\n" + "194:" // Height 6: Partial accumulate: partial_1_0 + "ldr s9, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "195:" // Height 6: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 197f\n" + "196:" // Height 6: full accumulate + "ldr q9, [x28, #0x0]\n" + "ldr q10, [x28, #0x10]\n" + "ldr q11, [x28, #0x20]\n" + "ldr q16, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q17, [x22, #0x0]\n" + "ldr q18, [x22, #0x10]\n" + "ldr q19, [x22, #0x20]\n" + "ldr q24, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q25, [x20, #0x0]\n" + "ldr q26, [x20, #0x10]\n" + "ldr q27, [x20, #0x20]\n" + "ldr q6, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "197:" // Height 6: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 199f\n" + "198:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "199:" // Height 6: setup done + "mov x27, #0x0\n" + "200:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 201f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 202f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 202f\n" + "201:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, 
x19\n" + "add x20, x21, x19\n" + "202:" // Height 6: input setup done + "cmp x26, #0x10\n" + "blt 205f\n" + "ldr q1, [x25, #0x0]\n" + "ldr q2, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "blt 204f\n" + "203:" // Height 6: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "add x25, x25, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x24, x24, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x23, x23, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x22, x22, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x21, x21, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x20\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + "ldr q2, [x24, #0x0]\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, 
v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + "bge 203b\n" + "204:" // Height 6: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q3, [x23, #0x0]\n" + "sub x26, x26, #0x10\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q4, [x22, #0x0]\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr q5, [x21, #0x0]\n" + "add x24, x24, #0x10\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "ldr q6, [x20, #0x0]\n" + "add x23, x23, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q7, [x9, #0x0]\n" + "add x22, x22, #0x10\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + "add x21, x21, #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x80]\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x90]\n" + ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xa0]\n" + ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xb0]\n" + ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xc0]\n" + ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" + ".inst 
0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xd0]\n" + ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x9, #0xe0]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + "205:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 212f\n" + "cmp x26, #0x8\n" + "blt 207f\n" + "206:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x25], #0x8\n" + "sub x26, x26, #0x8\n" + "ldr d2, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d3, [x23], #0x8\n" + "cmp x26, #0x8\n" + "ldr d4, [x22], #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d5, [x21], #0x8\n" + "ldr d7, [x20], #0x8\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x30]\n" + ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x40]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x50]\n" + ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x60]\n" + ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n" + ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n" + "bge 206b\n" + "cbz x26, 212f\n" + "207:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #2, 209f\n" + "ldr s1, [x25], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "ldr s6, [x20], #0x4\n" + "tbz x26, #1, 208f\n" + "ld1 { v1.h }[2], [x25], #0x2\n" + "ld1 { v2.h }[2], [x24], #0x2\n" + "ld1 { v3.h }[2], [x23], #0x2\n" + "ld1 { v4.h }[2], [x22], #0x2\n" + "ld1 { v5.h }[2], [x21], #0x2\n" + "ld1 { v6.h }[2], [x20], #0x2\n" + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[6], [x25]\n" + "ld1 { v2.b }[6], [x24]\n" + "ld1 { v3.b }[6], [x23]\n" + "ld1 { v4.b }[6], [x22]\n" + "ld1 { v5.b }[6], [x21]\n" + "ld1 { v6.b }[6], [x20]\n" + "b 211f\n" + "208:" // Height 6: Multiply 
loop: Ragged operand read: partial_1_4 + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[4], [x25]\n" + "ld1 { v2.b }[4], [x24]\n" + "ld1 { v3.b }[4], [x23]\n" + "ld1 { v4.b }[4], [x22]\n" + "ld1 { v5.b }[4], [x21]\n" + "ld1 { v6.b }[4], [x20]\n" + "b 211f\n" + "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x26, #1, 210f\n" + "ldr h1, [x25], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "ldr h6, [x20], #0x2\n" + "tbz x26, #0, 211f\n" + "ld1 { v1.b }[2], [x25]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x21]\n" + "ld1 { v6.b }[2], [x20]\n" + "b 211f\n" + "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b1, [x25, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x21, #0x0]\n" + "ldr b6, [x20, #0x0]\n" + "211:" // Height 6: Multiply loop: Ragged operand read: Done + "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr q7, [x9, #0x0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x30]\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x40]\n" + ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x50]\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" + ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + "212:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 200b\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x19, #0x0]\n" + 
"uzp2 v16.2d, v16.2d, v20.2d\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "bge 221f\n" + "tbz x10, #3, 216f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v17.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "st1 { v24.4s }, [x19], #0x10\n" + "st1 { v25.4s }, [x19], #0x10\n" + "tbz x10, #2, 214f\n" + "st1 { v13.4s }, [x28], #0x10\n" + "st1 { v10.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v18.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x20], #0x10\n" + "st1 { v26.4s }, [x19], #0x10\n" + "tbz x10, #1, 213f\n" + "str d14, [x28], #0x8\n" + "str d11, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d19, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "str d27, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v14.s }[2], [x28]\n" + "st1 { v11.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v19.s }[2], [x21]\n" + "st1 { v30.s }[2], [x20]\n" + "st1 { v27.s }[2], [x19]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 220f\n" + "str s14, [x28, #0x0]\n" + "str s11, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s19, [x21, #0x0]\n" + "str s30, [x20, #0x0]\n" + "str s27, [x19, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 215f\n" + "str d13, [x28], #0x8\n" + "str d10, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d18, [x21], #0x8\n" + "str d29, [x20], #0x8\n" + "str d26, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v13.s }[2], [x28]\n" + "st1 { v10.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v18.s }[2], [x21]\n" + "st1 { v29.s }[2], [x20]\n" + "st1 { v26.s }[2], [x19]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 220f\n" + "str s13, [x28, #0x0]\n" + "str s10, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s18, [x21, #0x0]\n" + "str s29, [x20, #0x0]\n" + "str s26, [x19, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 218f\n" + "st1 { v7.4s }, [x28], #0x10\n" + "st1 { v8.4s }, [x23], #0x10\n" + "st1 { v15.4s }, [x22], #0x10\n" + "st1 { v16.4s }, [x21], #0x10\n" + "st1 { v23.4s }, [x20], #0x10\n" + "st1 { v24.4s }, [x19], #0x10\n" + "tbz x10, #1, 217f\n" + "str d12, [x28], #0x8\n" + "str d9, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d17, [x21], #0x8\n" + "str d28, [x20], #0x8\n" + "str d25, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v12.s }[2], [x28]\n" + "st1 { v9.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v17.s }[2], [x21]\n" + "st1 { v28.s }[2], [x20]\n" + "st1 { v25.s }[2], [x19]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 220f\n" + "str s12, [x28, #0x0]\n" + "str s9, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s17, [x21, #0x0]\n" + "str s28, [x20, #0x0]\n" + "str s25, [x19, #0x0]\n" + "b 220f\n" + "218:" // Height 
6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 219f\n" + "str d7, [x28], #0x8\n" + "str d8, [x23], #0x8\n" + "str d15, [x22], #0x8\n" + "str d16, [x21], #0x8\n" + "str d23, [x20], #0x8\n" + "str d24, [x19], #0x8\n" + "tbz x10, #0, 220f\n" + "st1 { v7.s }[2], [x28]\n" + "st1 { v8.s }[2], [x23]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v16.s }[2], [x21]\n" + "st1 { v23.s }[2], [x20]\n" + "st1 { v24.s }[2], [x19]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s7, [x28, #0x0]\n" + "str s8, [x23, #0x0]\n" + "str s15, [x22, #0x0]\n" + "str s16, [x21, #0x0]\n" + "str s23, [x20, #0x0]\n" + "str s24, [x19, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q7, [x28, #0x0]\n" + "str q12, [x28, #0x10]\n" + "str q13, [x28, #0x20]\n" + "str q14, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q8, [x23, #0x0]\n" + "str q9, [x23, #0x10]\n" + "str q10, [x23, #0x20]\n" + "str q11, [x23, #0x30]\n" + "str q15, [x22, #0x0]\n" + "str q20, [x22, #0x10]\n" + "str q21, [x22, #0x20]\n" + "str q22, [x22, #0x30]\n" + "str q16, [x21, #0x0]\n" + "str q17, [x21, #0x10]\n" + "str q18, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "str q23, [x20, #0x0]\n" + "str q28, [x20, #0x10]\n" + "str q29, [x20, #0x20]\n" + "str q30, [x20, #0x30]\n" + "str q24, [x19, #0x0]\n" + "str q25, [x19, #0x10]\n" + "str q26, [x19, #0x20]\n" + "str q27, [x19, #0x30]\n" + "222:" // Height 6: Writeback done + "subs x10, x10, #0x10\n" + "bgt 187b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 224f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 223f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp index 2fea5ad2e7..153a4cc167 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -10,63 +10,92 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once #ifdef __aarch64__ - -#include "../bfloat.hpp" #include "../std_transforms_fixed.hpp" +#include "../bfloat.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const bfloat16 *, const bfloat16 *, \ + float *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int); +void a64_interleaved_bf16fp32_dot_8x12( ARGLIST ); -class cls_a64_interleaved_bf16fp32_dot_8x12 { +class cls_a64_interleaved_bf16fp32_dot_8x12 +{ public: typedef bfloat16 operand_type; typedef float result_type; - typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return 12; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return 4; } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 2; } - // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=a64_interleaved_bf16fp32_dot_8x12; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.93, 4.16, 7.19 }; + case CPUModel::V1: + return { 20.88, 5.10, 6.57 }; + case CPUModel::A510: + return { 7.77, 3.69, 3.02 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=a64_interleaved_bf16fp32_dot_8x12; cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp index 92149a5579..5689f89781 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,305 +23,231 @@ */ #ifdef __aarch64__ +#include #include "../../bfloat.hpp" -#include "../../asmlib.hpp" namespace arm_gemm { -void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; +void a64_interleaved_bf16fp32_dot_8x12( + const bfloat16 *Apanel, const bfloat16 *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { - K /= 2; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const bfloat16 *Bpanel = {}; + } ka; - for (int yb=0; yb +#include "../../bfloat.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_dot_8x12_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb transforms = {}; + StdTransformsFixed transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.54, 4.30, 7.33 }; + case CPUModel::V1: + return { 59.94, 5.08, 9.83 }; + case CPUModel::A510: + return { 7.82, 4.05, 3.07 }; + } + } - kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.15, 2.51, 5.25 }; + case CPUModel::V1: + return { 59.44, 3.18, 7.26 }; + case CPUModel::A510: + return { 7.83, 2.53, 2.71 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12; cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp index c476fcf171..94c72a31c9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,406 +23,269 @@ */ #ifdef __aarch64__ +#include #include "../../bfloat.hpp" -#include "../../asmlib.hpp" namespace arm_gemm { -void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; +void a64_interleaved_bf16fp32_mmla_8x12( + const bfloat16 *Apanel, const bfloat16 *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const bfloat16 *Bpanel = {}; + } ka; - for (int yb=0; yb transforms = {}; + StdTransformsFixed transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 7.16, 1.14, 0.67 }; + default: + return { 12.67, 3.98, 1.16 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_interleaved_fp16_mla_8x24; + cls_a64_interleaved_fp16_mla_8x24(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_interleaved_fp16_mla_8x24_a55; + break; + case CPUModel::X1: + kernel=a64_interleaved_fp16_mla_8x24_x1; + break; + } + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp new file mode 100644 index 0000000000..49500f2d18 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include <cstddef> + +namespace arm_gemm { + +void a64_interleaved_fp16_mla_8x24_a55( + const __fp16 *Apanel, const __fp16 *Bpanel, + __fp16 *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const __fp16 *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x10, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x9, %x[Apanel]\n" + "ldr x28, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x27, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x9\n" + "cmp x27, #0x2\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "prfm pldl1keep, [x28, #0x0]\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x28, #0x40]\n" + "movi v12.16b, #0x0\n" + "prfm pldl1keep, [x28, #0x80]\n" + "movi v13.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v14.16b, #0x0\n" + "ldr q2, [x28, #0x0]\n" + "movi v15.16b, #0x0\n" + "ldr q3, [x28, #0x10]\n" + "movi v16.16b, #0x0\n" + "ldr q4, [x28, #0x20]\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "ldr d1, [%x[Apanel], #0x10]\n" + "fmla v8.8h, v2.8h, v0.h[0]\n" + "ldr x26, [%x[Apanel], #0x18]\n" + "fmla v11.8h, v2.8h, v0.h[1]\n" + "ldr d5, [x28, #0x30]\n" + "fmla v14.8h, v2.8h, v0.h[2]\n" + "ldr x25, [x28, #0x38]\n" + "fmla v17.8h, v2.8h, v0.h[3]\n" + "ldr d6, [x28, #0x40]\n" + "fmla v20.8h, v2.8h, v0.h[4]\n" + "ldr x24, [x28, #0x48]\n" + "fmla v23.8h, v2.8h, v0.h[5]\n" + "ldr d7, [x28, #0x50]\n" + "fmla v26.8h, v2.8h, v0.h[6]\n" + "ldr x23, [x28, #0x58]\n" + "fmla v29.8h, v2.8h, v0.h[7]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v9.8h, v3.8h, v0.h[0]\n" + "prfm pldl1keep, [x28, #0x100]\n" + "fmla v12.8h, v3.8h, v0.h[1]\n" + "prfm pldl1keep, [x28, #0x140]\n" + "fmla v15.8h, v3.8h, v0.h[2]\n" + "add x28, x28, #0x60\n" + "fmla v18.8h, v3.8h, v0.h[3]\n" + "ldr d2, [x28, #0x0]\n" + "fmla v21.8h, v3.8h, v0.h[4]\n" + "ldr x22, [x28, #0x8]\n" + "fmla v24.8h, v3.8h, v0.h[5]\n" + "ldr x21, [x28, #0x18]\n" + "fmla v27.8h, v3.8h, v0.h[6]\n" + "ldr x20, [%x[Apanel], #0x8]\n" + "fmla v30.8h, v3.8h, v0.h[7]\n" + "ldr d3, [x28, #0x10]\n" + "fmla v10.8h, v4.8h, v0.h[0]\n" + "ldr x19, [x28, #0x28]\n" + "fmla v13.8h, v4.8h, v0.h[1]\n" + "mov v1.d[1], x26\n" + "fmla v16.8h, v4.8h, v0.h[2]\n" + "mov v5.d[1], x25\n" + "fmla v19.8h, v4.8h, v0.h[3]\n" + "mov v6.d[1], x24\n" + "fmla v22.8h, v4.8h, v0.h[4]\n" + "mov v7.d[1], x23\n" + "fmla v25.8h, v4.8h, v0.h[5]\n" + "sub x27, x27, #0x2\n" + "fmla v28.8h, v4.8h, v0.h[6]\n" + "cmp x27, #0x2\n" + "fmla v31.8h, v4.8h, v0.h[7]\n" + "ldr d0, [%x[Apanel], #0x0]\n" + "ldr d4, [x28, #0x20]\n" + "mov v2.d[1], x22\n" + "mov v3.d[1], x21\n" + "fmla v8.8h, v5.8h, v1.h[0]\n" + "mov v0.d[1], x20\n" + "fmla v11.8h, v5.8h, v1.h[1]\n" + "mov v4.d[1], x19\n" + "fmla v14.8h, v5.8h, v1.h[2]\n" + "fmla v17.8h, v5.8h, v1.h[3]\n" + "fmla v20.8h, v5.8h, v1.h[4]\n" + "fmla v23.8h, v5.8h, v1.h[5]\n" + "fmla v26.8h, v5.8h,
v1.h[6]\n" + "fmla v29.8h, v5.8h, v1.h[7]\n" + "fmla v9.8h, v6.8h, v1.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v15.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v1.h[3]\n" + "fmla v21.8h, v6.8h, v1.h[4]\n" + "fmla v24.8h, v6.8h, v1.h[5]\n" + "fmla v27.8h, v6.8h, v1.h[6]\n" + "fmla v30.8h, v6.8h, v1.h[7]\n" + "fmla v10.8h, v7.8h, v1.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v16.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v1.h[3]\n" + "fmla v22.8h, v7.8h, v1.h[4]\n" + "fmla v25.8h, v7.8h, v1.h[5]\n" + "fmla v28.8h, v7.8h, v1.h[6]\n" + "fmla v31.8h, v7.8h, v1.h[7]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla v8.8h, v2.8h, v0.h[0]\n" + "add x28, x28, #0x30\n" + "fmla v11.8h, v2.8h, v0.h[1]\n" + "fmla v14.8h, v2.8h, v0.h[2]\n" + "fmla v17.8h, v2.8h, v0.h[3]\n" + "fmla v20.8h, v2.8h, v0.h[4]\n" + "fmla v23.8h, v2.8h, v0.h[5]\n" + "fmla v26.8h, v2.8h, v0.h[6]\n" + "fmla v29.8h, v2.8h, v0.h[7]\n" + "fmla v9.8h, v3.8h, v0.h[0]\n" + "fmla v12.8h, v3.8h, v0.h[1]\n" + "fmla v15.8h, v3.8h, v0.h[2]\n" + "fmla v18.8h, v3.8h, v0.h[3]\n" + "fmla v21.8h, v3.8h, v0.h[4]\n" + "fmla v24.8h, v3.8h, v0.h[5]\n" + "fmla v27.8h, v3.8h, v0.h[6]\n" + "fmla v30.8h, v3.8h, v0.h[7]\n" + "fmla v10.8h, v4.8h, v0.h[0]\n" + "fmla v13.8h, v4.8h, v0.h[1]\n" + "fmla v16.8h, v4.8h, v0.h[2]\n" + "fmla v19.8h, v4.8h, v0.h[3]\n" + "fmla v22.8h, v4.8h, v0.h[4]\n" + "fmla v25.8h, v4.8h, v0.h[5]\n" + "fmla v28.8h, v4.8h, v0.h[6]\n" + "fmla v31.8h, v4.8h, v0.h[7]\n" + "cbz x27, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "ldr q5, [x28, #0x0]\n" + "fmla v8.8h, v5.8h, v0.h[0]\n" + "ldr q6, [x28, #0x10]\n" + "fmla v11.8h, v5.8h, v0.h[1]\n" + "ldr q7, [x28, #0x20]\n" + "fmla v14.8h, v5.8h, v0.h[2]\n" + "fmla v17.8h, v5.8h, v0.h[3]\n" + "add x28, x28, #0x30\n" + "fmla v20.8h, v5.8h, v0.h[4]\n" + "fmla v23.8h, v5.8h, v0.h[5]\n" + "fmla v26.8h, v5.8h, v0.h[6]\n" + "fmla v29.8h, v5.8h, v0.h[7]\n" + "fmla v9.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v0.h[1]\n" + "fmla v15.8h, v6.8h, v0.h[2]\n" + "fmla v18.8h, v6.8h, v0.h[3]\n" + "fmla v21.8h, v6.8h, v0.h[4]\n" + "fmla v24.8h, v6.8h, v0.h[5]\n" + "fmla v27.8h, v6.8h, v0.h[6]\n" + "fmla v30.8h, v6.8h, v0.h[7]\n" + "fmla v10.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v0.h[1]\n" + "fmla v16.8h, v7.8h, v0.h[2]\n" + "fmla v19.8h, v7.8h, v0.h[3]\n" + "fmla v22.8h, v7.8h, v0.h[4]\n" + "fmla v25.8h, v7.8h, v0.h[5]\n" + "fmla v28.8h, v7.8h, v0.h[6]\n" + "fmla v31.8h, v7.8h, v0.h[7]\n" + "5:" // multiply loop done + "subs x10, x10, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), 
[Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp new file mode 100644 index 0000000000..a9da6956ed --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include <cstddef> + +namespace arm_gemm { + +void a64_interleaved_fp16_mla_8x24( + const __fp16 *Apanel, const __fp16 *Bpanel, + __fp16 *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const __fp16 *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "ldr q2, [x20, #0x0]\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "ldr q3, [x20, #0x10]\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "ldr q4, [x20, #0x20]\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "fmla v8.8h, v2.8h, v0.h[0]\n" + "fmla v11.8h, v2.8h, v0.h[1]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "fmla v14.8h, v2.8h, v0.h[2]\n" + "fmla v17.8h, v2.8h, v0.h[3]\n" + "ldr q5, [x20, #0x30]\n" + "fmla v20.8h, v2.8h, v0.h[4]\n" + "fmla v23.8h, v2.8h, v0.h[5]\n" + "ldr q6, [x20, #0x40]\n" + "fmla v26.8h, v2.8h, v0.h[6]\n" + "fmla v29.8h, v2.8h, v0.h[7]\n" + "ldr q7, [x20, #0x50]\n" + "fmla v9.8h, v3.8h, v0.h[0]\n" + "fmla v12.8h, v3.8h, v0.h[1]\n" + "sub x19, x19, #0x2\n" + "fmla v15.8h, v3.8h, v0.h[2]\n" + "fmla v18.8h, v3.8h, v0.h[3]\n" + "cmp x19, #0x2\n" + "fmla v21.8h, v3.8h, v0.h[4]\n" + "fmla v24.8h, v3.8h, v0.h[5]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + "fmla v27.8h, v3.8h, v0.h[6]\n" + "fmla v30.8h, v3.8h, v0.h[7]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v10.8h, v4.8h, v0.h[0]\n" + "fmla v13.8h, v4.8h, v0.h[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + "fmla v16.8h, v4.8h, v0.h[2]\n" + "fmla v19.8h, v4.8h, v0.h[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + "fmla v22.8h, v4.8h, v0.h[4]\n" + "fmla v25.8h, v4.8h, v0.h[5]\n" + "add x20, x20, #0x60\n" + "fmla v28.8h, v4.8h, v0.h[6]\n" + "fmla v31.8h, v4.8h, v0.h[7]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "fmla v8.8h, v5.8h, v1.h[0]\n" + "fmla v11.8h, v5.8h, v1.h[1]\n" + "ldr q2, [x20, #0x0]\n" + "fmla v14.8h, v5.8h, v1.h[2]\n" + "fmla v17.8h, v5.8h, v1.h[3]\n" + "ldr q3, [x20, #0x10]\n" + "fmla v20.8h, v5.8h, v1.h[4]\n" + "fmla v23.8h, v5.8h, v1.h[5]\n" + "ldr q4, [x20, #0x20]\n" + "fmla v26.8h, v5.8h, v1.h[6]\n" + "fmla v29.8h, v5.8h, v1.h[7]\n" + "fmla v9.8h, v6.8h, v1.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v15.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v1.h[3]\n" + "fmla v21.8h, v6.8h, v1.h[4]\n" + "fmla v24.8h, v6.8h, v1.h[5]\n" + "fmla v27.8h, v6.8h, v1.h[6]\n" + "fmla v30.8h, v6.8h, v1.h[7]\n" + "fmla v10.8h, v7.8h, v1.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v16.8h, v7.8h, v1.h[2]\n" + "fmla
v19.8h, v7.8h, v1.h[3]\n" + "fmla v22.8h, v7.8h, v1.h[4]\n" + "fmla v25.8h, v7.8h, v1.h[5]\n" + "fmla v28.8h, v7.8h, v1.h[6]\n" + "fmla v31.8h, v7.8h, v1.h[7]\n" + "bge 3b\n" + "4:" // main loop skip + "fmla v8.8h, v2.8h, v0.h[0]\n" + "fmla v11.8h, v2.8h, v0.h[1]\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla v14.8h, v2.8h, v0.h[2]\n" + "fmla v17.8h, v2.8h, v0.h[3]\n" + "add x20, x20, #0x30\n" + "fmla v20.8h, v2.8h, v0.h[4]\n" + "fmla v23.8h, v2.8h, v0.h[5]\n" + "fmla v26.8h, v2.8h, v0.h[6]\n" + "fmla v29.8h, v2.8h, v0.h[7]\n" + "fmla v9.8h, v3.8h, v0.h[0]\n" + "fmla v12.8h, v3.8h, v0.h[1]\n" + "fmla v15.8h, v3.8h, v0.h[2]\n" + "fmla v18.8h, v3.8h, v0.h[3]\n" + "fmla v21.8h, v3.8h, v0.h[4]\n" + "fmla v24.8h, v3.8h, v0.h[5]\n" + "fmla v27.8h, v3.8h, v0.h[6]\n" + "fmla v30.8h, v3.8h, v0.h[7]\n" + "fmla v10.8h, v4.8h, v0.h[0]\n" + "fmla v13.8h, v4.8h, v0.h[1]\n" + "fmla v16.8h, v4.8h, v0.h[2]\n" + "fmla v19.8h, v4.8h, v0.h[3]\n" + "fmla v22.8h, v4.8h, v0.h[4]\n" + "fmla v25.8h, v4.8h, v0.h[5]\n" + "fmla v28.8h, v4.8h, v0.h[6]\n" + "fmla v31.8h, v4.8h, v0.h[7]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "fmla v8.8h, v5.8h, v0.h[0]\n" + "ldr q6, [x20, #0x10]\n" + "ldr q7, [x20, #0x20]\n" + "fmla v11.8h, v5.8h, v0.h[1]\n" + "fmla v14.8h, v5.8h, v0.h[2]\n" + "fmla v17.8h, v5.8h, v0.h[3]\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla v20.8h, v5.8h, v0.h[4]\n" + "fmla v23.8h, v5.8h, v0.h[5]\n" + "add x20, x20, #0x30\n" + "fmla v26.8h, v5.8h, v0.h[6]\n" + "fmla v29.8h, v5.8h, v0.h[7]\n" + "fmla v9.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v0.h[1]\n" + "fmla v15.8h, v6.8h, v0.h[2]\n" + "fmla v18.8h, v6.8h, v0.h[3]\n" + "fmla v21.8h, v6.8h, v0.h[4]\n" + "fmla v24.8h, v6.8h, v0.h[5]\n" + "fmla v27.8h, v6.8h, v0.h[6]\n" + "fmla v30.8h, v6.8h, v0.h[7]\n" + "fmla v10.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v0.h[1]\n" + "fmla v16.8h, v7.8h, v0.h[2]\n" + "fmla v19.8h, v7.8h, v0.h[3]\n" + "fmla v22.8h, v7.8h, v0.h[4]\n" + "fmla v25.8h, v7.8h, v0.h[5]\n" + "fmla v28.8h, v7.8h, v0.h[6]\n" + "fmla v31.8h, v7.8h, v0.h[7]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp new file mode 100644 index 0000000000..efaedeb33f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include + +namespace arm_gemm { + +void a64_interleaved_fp16_mla_8x24_x1( + const __fp16 *Apanel, const __fp16 *Bpanel, + __fp16 *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const __fp16 *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "ldr q1, [x20, #0x0]\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "ldr q2, [x20, #0x10]\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "ldr q3, [x20, #0x20]\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "fmla v8.8h, v1.8h, v0.h[0]\n" + "fmla v11.8h, v1.8h, v0.h[1]\n" + "sub x19, x19, #0x2\n" + "fmla v14.8h, v1.8h, v0.h[2]\n" + "fmla v17.8h, v1.8h, v0.h[3]\n" + "cmp x19, #0x2\n" + "fmla v20.8h, v1.8h, v0.h[4]\n" + "fmla v23.8h, v1.8h, v0.h[5]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" 
+ "fmla v26.8h, v1.8h, v0.h[6]\n" + "fmla v29.8h, v1.8h, v0.h[7]\n" + "ldr q1, [x20, #0x30]\n" + "fmla v9.8h, v2.8h, v0.h[0]\n" + "fmla v12.8h, v2.8h, v0.h[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + "fmla v15.8h, v2.8h, v0.h[2]\n" + "fmla v18.8h, v2.8h, v0.h[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + "fmla v21.8h, v2.8h, v0.h[4]\n" + "fmla v24.8h, v2.8h, v0.h[5]\n" + "fmla v27.8h, v2.8h, v0.h[6]\n" + "fmla v30.8h, v2.8h, v0.h[7]\n" + "ldr q2, [x20, #0x40]\n" + "fmla v10.8h, v3.8h, v0.h[0]\n" + "fmla v13.8h, v3.8h, v0.h[1]\n" + "fmla v16.8h, v3.8h, v0.h[2]\n" + "fmla v19.8h, v3.8h, v0.h[3]\n" + "fmla v22.8h, v3.8h, v0.h[4]\n" + "fmla v25.8h, v3.8h, v0.h[5]\n" + "fmla v28.8h, v3.8h, v0.h[6]\n" + "fmla v31.8h, v3.8h, v0.h[7]\n" + "ldr q0, [%x[Apanel], #0x10]\n" + "ldr q3, [x20, #0x50]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "add x20, x20, #0x60\n" + "fmla v8.8h, v1.8h, v0.h[0]\n" + "fmla v11.8h, v1.8h, v0.h[1]\n" + "fmla v14.8h, v1.8h, v0.h[2]\n" + "fmla v17.8h, v1.8h, v0.h[3]\n" + "fmla v20.8h, v1.8h, v0.h[4]\n" + "fmla v23.8h, v1.8h, v0.h[5]\n" + "fmla v26.8h, v1.8h, v0.h[6]\n" + "fmla v29.8h, v1.8h, v0.h[7]\n" + "ldr q1, [x20, #0x0]\n" + "fmla v9.8h, v2.8h, v0.h[0]\n" + "fmla v12.8h, v2.8h, v0.h[1]\n" + "fmla v15.8h, v2.8h, v0.h[2]\n" + "fmla v18.8h, v2.8h, v0.h[3]\n" + "fmla v21.8h, v2.8h, v0.h[4]\n" + "fmla v24.8h, v2.8h, v0.h[5]\n" + "fmla v27.8h, v2.8h, v0.h[6]\n" + "fmla v30.8h, v2.8h, v0.h[7]\n" + "ldr q2, [x20, #0x10]\n" + "fmla v10.8h, v3.8h, v0.h[0]\n" + "fmla v13.8h, v3.8h, v0.h[1]\n" + "fmla v16.8h, v3.8h, v0.h[2]\n" + "fmla v19.8h, v3.8h, v0.h[3]\n" + "fmla v22.8h, v3.8h, v0.h[4]\n" + "fmla v25.8h, v3.8h, v0.h[5]\n" + "fmla v28.8h, v3.8h, v0.h[6]\n" + "fmla v31.8h, v3.8h, v0.h[7]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q3, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "fmla v8.8h, v1.8h, v0.h[0]\n" + "fmla v11.8h, v1.8h, v0.h[1]\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla v14.8h, v1.8h, v0.h[2]\n" + "fmla v17.8h, v1.8h, v0.h[3]\n" + "add x20, x20, #0x30\n" + "fmla v20.8h, v1.8h, v0.h[4]\n" + "fmla v23.8h, v1.8h, v0.h[5]\n" + "fmla v26.8h, v1.8h, v0.h[6]\n" + "fmla v29.8h, v1.8h, v0.h[7]\n" + "fmla v9.8h, v2.8h, v0.h[0]\n" + "fmla v12.8h, v2.8h, v0.h[1]\n" + "fmla v15.8h, v2.8h, v0.h[2]\n" + "fmla v18.8h, v2.8h, v0.h[3]\n" + "fmla v21.8h, v2.8h, v0.h[4]\n" + "fmla v24.8h, v2.8h, v0.h[5]\n" + "fmla v27.8h, v2.8h, v0.h[6]\n" + "fmla v30.8h, v2.8h, v0.h[7]\n" + "fmla v10.8h, v3.8h, v0.h[0]\n" + "fmla v13.8h, v3.8h, v0.h[1]\n" + "fmla v16.8h, v3.8h, v0.h[2]\n" + "fmla v19.8h, v3.8h, v0.h[3]\n" + "fmla v22.8h, v3.8h, v0.h[4]\n" + "fmla v25.8h, v3.8h, v0.h[5]\n" + "fmla v28.8h, v3.8h, v0.h[6]\n" + "fmla v31.8h, v3.8h, v0.h[7]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q4, [x20, #0x0]\n" + "fmla v8.8h, v4.8h, v0.h[0]\n" + "ldr q5, [x20, #0x10]\n" + "ldr q6, [x20, #0x20]\n" + "fmla v11.8h, v4.8h, v0.h[1]\n" + "fmla v14.8h, v4.8h, v0.h[2]\n" + "fmla v17.8h, v4.8h, v0.h[3]\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla v20.8h, v4.8h, v0.h[4]\n" + "fmla v23.8h, v4.8h, v0.h[5]\n" + "add x20, x20, #0x30\n" + "fmla v26.8h, v4.8h, v0.h[6]\n" + "fmla v29.8h, v4.8h, v0.h[7]\n" + "fmla v9.8h, v5.8h, v0.h[0]\n" + "fmla v12.8h, v5.8h, v0.h[1]\n" + "fmla v15.8h, v5.8h, v0.h[2]\n" + "fmla v18.8h, v5.8h, v0.h[3]\n" + "fmla v21.8h, v5.8h, v0.h[4]\n" + "fmla v24.8h, v5.8h, v0.h[5]\n" + "fmla v27.8h, v5.8h, v0.h[6]\n" + "fmla v30.8h, v5.8h, v0.h[7]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v13.8h, v6.8h, v0.h[1]\n" + "fmla v16.8h, v6.8h, v0.h[2]\n" + 
"fmla v19.8h, v6.8h, v0.h[3]\n" + "fmla v22.8h, v6.8h, v0.h[4]\n" + "fmla v25.8h, v6.8h, v0.h[5]\n" + "fmla v28.8h, v6.8h, v0.h[6]\n" + "fmla v31.8h, v6.8h, v0.h[7]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp new file mode 100644 index 0000000000..465a5b4e0f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + const float *, const float *, \ + float *, int, int, int + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_interleaved_fp32_mla_8x12( ARGLIST ); +void a64_interleaved_fp32_mla_8x12_a55( ARGLIST ); +void a64_interleaved_fp32_mla_8x12_x1( ARGLIST ); + +class cls_a64_interleaved_fp32_mla_8x12 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return 12; + } + + static unsigned int stripe_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + + StdTransformsFixed<operand_type, result_type, 8, 12, 1> transforms = {}; + StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 3.954, 1.252, 1.141 }; + default: + return { 7.2307, 3.876, 2.932 }; + case CPUModel::A73: + return { 2.885, 1.429, 1.163 }; + case CPUModel::A53: + return { 2.7, 0.9, 0.8 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_interleaved_fp32_mla_8x12; + cls_a64_interleaved_fp32_mla_8x12(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + case CPUModel::A53: + kernel=a64_interleaved_fp32_mla_8x12_a55; + break; + case CPUModel::X1: + kernel=a64_interleaved_fp32_mla_8x12_x1; + break; + } + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp new file mode 100644 index 0000000000..46d9ff73b9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include <cstddef> + +namespace arm_gemm { + +void a64_interleaved_fp32_mla_8x12_a55( + const float *Apanel, const float *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const float *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x28, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x27, %x[Apanel]\n" + "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x25, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x27\n" + "cmp x25, #0x4\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "prfm pldl1keep, [x26, #0x0]\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x26, #0x40]\n" + "movi v12.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v13.16b, #0x0\n" + "prfm pldl1keep, [x26, #0x80]\n" + "movi v14.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + "movi v15.16b, #0x0\n" + "prfm pldl1keep, [x26, #0xc0]\n" + "movi v16.16b, #0x0\n" + "prfm pldl1keep, [x26, #0x100]\n" + "movi v17.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0xc0]\n" + "movi v18.16b, #0x0\n" + "prfm pldl1keep, [x26, #0x140]\n" + "movi v19.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v20.16b, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v21.16b, #0x0\n" + "ldr q4, [x26, #0x0]\n" + "movi v22.16b, #0x0\n" + "ldr q5, [x26, #0x10]\n" + "movi v23.16b, #0x0\n" + "ldr q6, [x26, #0x20]\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "ldr d2, [%x[Apanel], #0x20]\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr x21, [%x[Apanel], #0x28]\n" + "fmla v11.4s, v4.4s, v0.s[1]\n" + "ldr d3, [%x[Apanel], #0x30]\n" + "fmla v14.4s, v4.4s, v0.s[2]\n" + "ldr x20, [%x[Apanel], #0x38]\n" + "fmla v17.4s, v4.4s, v0.s[3]\n" + "ldr d7, [x26, #0x30]\n" + "fmla v20.4s, v4.4s, v1.s[0]\n" + "ldr x24, [x26, #0x38]\n" + "fmla v23.4s, v4.4s, v1.s[1]\n" + "fmla v26.4s, v4.4s, v1.s[2]\n" + "ldr x23, [x26, #0x48]\n" + "fmla v29.4s, v4.4s, v1.s[3]\n" + "ldr d4, [x26, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "mov v2.d[1], x21\n" + "fmla v12.4s, v5.4s, v0.s[1]\n" + "mov v3.d[1], x20\n" + "fmla v15.4s, v5.4s, v0.s[2]\n" + "mov v7.d[1], x24\n" + "fmla v18.4s, v5.4s, v0.s[3]\n" + "mov v4.d[1], x23\n" + "fmla v21.4s, v5.4s, v1.s[0]\n" + "ldr x22, [x26, #0x58]\n" + "fmla v24.4s, v5.4s, v1.s[1]\n" + "ldr x21, [%x[Apanel], #0x48]\n" + "fmla v27.4s, v5.4s, v1.s[2]\n" + "ldr x20, [%x[Apanel], #0x58]\n" + "fmla v30.4s, v5.4s, v1.s[3]\n" + "ldr d5, [x26, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr x19, [x26, #0x68]\n" + "fmla v13.4s, v6.4s, v0.s[1]\n" + "ldr x24, [x26, #0x78]\n" + "fmla v16.4s, v6.4s, v0.s[2]\n" + "mov v5.d[1], x22\n" + "fmla v19.4s, v6.4s, v0.s[3]\n" + "ldr d0, [%x[Apanel], #0x40]\n" + "fmla v22.4s, v6.4s, v1.s[0]\n" + "mov v0.d[1], x21\n" + "fmla v25.4s, v6.4s, v1.s[1]\n" + "ldr x23, [x26, #0x88]\n" + "fmla v28.4s, v6.4s, v1.s[2]\n" + "ldr x21, [%x[Apanel], #0x68]\n" + "fmla v31.4s, v6.4s, v1.s[3]\n" + "ldr d1, [%x[Apanel], #0x50]\n" + "ldr d6, [x26, #0x60]\n" + "fmla v8.4s, v7.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v2.s[1]\n" + "mov v1.d[1], x20\n" + "fmla v14.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x19\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr x20,
[%x[Apanel], #0x78]\n" + "fmla v20.4s, v7.4s, v3.s[0]\n" + "ldr x22, [x26, #0x98]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr x19, [x26, #0xa8]\n" + "fmla v26.4s, v7.4s, v3.s[2]\n" + "prfm pldl1keep, [%x[Apanel], #0x100]\n" + "fmla v29.4s, v7.4s, v3.s[3]\n" + "ldr d7, [x26, #0x70]\n" + "fmla v9.4s, v4.4s, v2.s[0]\n" + "mov v7.d[1], x24\n" + "fmla v12.4s, v4.4s, v2.s[1]\n" + "ldr x24, [x26, #0xb8]\n" + "fmla v15.4s, v4.4s, v2.s[2]\n" + "prfm pldl1keep, [x26, #0x180]\n" + "fmla v18.4s, v4.4s, v2.s[3]\n" + "prfm pldl1keep, [x26, #0x1c0]\n" + "fmla v21.4s, v4.4s, v3.s[0]\n" + "prfm pldl1keep, [%x[Apanel], #0x140]\n" + "fmla v24.4s, v4.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, #0x200]\n" + "fmla v27.4s, v4.4s, v3.s[2]\n" + "sub x25, x25, #0x4\n" + "fmla v30.4s, v4.4s, v3.s[3]\n" + "ldr d4, [x26, #0x80]\n" + "fmla v10.4s, v5.4s, v2.s[0]\n" + "mov v4.d[1], x23\n" + "fmla v13.4s, v5.4s, v2.s[1]\n" + "cmp x25, #0x4\n" + "fmla v16.4s, v5.4s, v2.s[2]\n" + "fmla v19.4s, v5.4s, v2.s[3]\n" + "ldr d2, [%x[Apanel], #0x60]\n" + "fmla v22.4s, v5.4s, v3.s[0]\n" + "mov v2.d[1], x21\n" + "fmla v25.4s, v5.4s, v3.s[1]\n" + "fmla v28.4s, v5.4s, v3.s[2]\n" + "fmla v31.4s, v5.4s, v3.s[3]\n" + "ldr d3, [%x[Apanel], #0x70]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d5, [x26, #0x90]\n" + "fmla v11.4s, v6.4s, v0.s[1]\n" + "mov v3.d[1], x20\n" + "fmla v14.4s, v6.4s, v0.s[2]\n" + "mov v5.d[1], x22\n" + "fmla v17.4s, v6.4s, v0.s[3]\n" + "add %x[Apanel], %x[Apanel], #0x80\n" + "fmla v20.4s, v6.4s, v1.s[0]\n" + "ldr x21, [%x[Apanel], #0x8]\n" + "fmla v23.4s, v6.4s, v1.s[1]\n" + "ldr x20, [%x[Apanel], #0x18]\n" + "fmla v26.4s, v6.4s, v1.s[2]\n" + "fmla v29.4s, v6.4s, v1.s[3]\n" + "ldr d6, [x26, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x19\n" + "fmla v12.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v0.s[2]\n" + "fmla v18.4s, v7.4s, v0.s[3]\n" + "fmla v21.4s, v7.4s, v1.s[0]\n" + "fmla v24.4s, v7.4s, v1.s[1]\n" + "fmla v27.4s, v7.4s, v1.s[2]\n" + "fmla v30.4s, v7.4s, v1.s[3]\n" + "ldr d7, [x26, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "add x26, x26, #0xc0\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "ldr x23, [x26, #0x8]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "ldr x22, [x26, #0x18]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "ldr d0, [%x[Apanel], #0x0]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "ldr x19, [x26, #0x28]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "mov v7.d[1], x24\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "mov v0.d[1], x21\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "ldr d1, [%x[Apanel], #0x10]\n" + "fmla v8.4s, v5.4s, v2.s[0]\n" + "ldr d4, [x26, #0x0]\n" + "fmla v11.4s, v5.4s, v2.s[1]\n" + "mov v1.d[1], x20\n" + "fmla v14.4s, v5.4s, v2.s[2]\n" + "mov v4.d[1], x23\n" + "fmla v17.4s, v5.4s, v2.s[3]\n" + "fmla v20.4s, v5.4s, v3.s[0]\n" + "fmla v23.4s, v5.4s, v3.s[1]\n" + "fmla v26.4s, v5.4s, v3.s[2]\n" + "fmla v29.4s, v5.4s, v3.s[3]\n" + "ldr d5, [x26, #0x10]\n" + "fmla v9.4s, v6.4s, v2.s[0]\n" + "mov v5.d[1], x22\n" + "fmla v12.4s, v6.4s, v2.s[1]\n" + "fmla v15.4s, v6.4s, v2.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v21.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v3.s[1]\n" + "fmla v27.4s, v6.4s, v3.s[2]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "ldr d6, [x26, #0x20]\n" + "mov v6.d[1], x19\n" + "fmla v10.4s, v7.4s, v2.s[0]\n" + "fmla v13.4s, v7.4s, v2.s[1]\n" + "fmla v16.4s, v7.4s, v2.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v22.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v3.s[1]\n" + "fmla v28.4s, v7.4s, v3.s[2]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], 
%x[Apanel], #0x20\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "add x26, x26, #0x30\n" + "fmla v11.4s, v4.4s, v0.s[1]\n" + "fmla v14.4s, v4.4s, v0.s[2]\n" + "fmla v17.4s, v4.4s, v0.s[3]\n" + "fmla v20.4s, v4.4s, v1.s[0]\n" + "fmla v23.4s, v4.4s, v1.s[1]\n" + "fmla v26.4s, v4.4s, v1.s[2]\n" + "fmla v29.4s, v4.4s, v1.s[3]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v5.4s, v0.s[1]\n" + "fmla v15.4s, v5.4s, v0.s[2]\n" + "fmla v18.4s, v5.4s, v0.s[3]\n" + "fmla v21.4s, v5.4s, v1.s[0]\n" + "fmla v24.4s, v5.4s, v1.s[1]\n" + "fmla v27.4s, v5.4s, v1.s[2]\n" + "fmla v30.4s, v5.4s, v1.s[3]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v6.4s, v0.s[1]\n" + "fmla v16.4s, v6.4s, v0.s[2]\n" + "fmla v19.4s, v6.4s, v0.s[3]\n" + "fmla v22.4s, v6.4s, v1.s[0]\n" + "fmla v25.4s, v6.4s, v1.s[1]\n" + "fmla v28.4s, v6.4s, v1.s[2]\n" + "fmla v31.4s, v6.4s, v1.s[3]\n" + "cbz x25, 6f\n" + "5:" // odd loop + "ldr q0, [%x[Apanel], #0x0]\n" + "subs x25, x25, #0x1\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q7, [x26, #0x0]\n" + "fmla v8.4s, v7.4s, v0.s[0]\n" + "ldr q4, [x26, #0x10]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q5, [x26, #0x20]\n" + "fmla v14.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v0.s[3]\n" + "add x26, x26, #0x30\n" + "fmla v20.4s, v7.4s, v1.s[0]\n" + "fmla v23.4s, v7.4s, v1.s[1]\n" + "fmla v26.4s, v7.4s, v1.s[2]\n" + "fmla v29.4s, v7.4s, v1.s[3]\n" + "fmla v9.4s, v4.4s, v0.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[1]\n" + "fmla v15.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v0.s[3]\n" + "fmla v21.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v1.s[1]\n" + "fmla v27.4s, v4.4s, v1.s[2]\n" + "fmla v30.4s, v4.4s, v1.s[3]\n" + "fmla v10.4s, v5.4s, v0.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[1]\n" + "fmla v16.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v0.s[3]\n" + "fmla v22.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v1.s[1]\n" + "fmla v28.4s, v5.4s, v1.s[2]\n" + "fmla v31.4s, v5.4s, v1.s[3]\n" + "bne 5b\n" + "6:" // multiply loop done + "subs x28, x28, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // 
namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp new file mode 100644 index 0000000000..06dc1534c1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> + +namespace arm_gemm { + +void a64_interleaved_fp32_mla_8x12( + const float *Apanel, const float *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const float *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x4\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "prfm pldl1keep, [x20, #0xc0]\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x100]\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0xc0]\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x140]\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [x20, #0x0]\n" + "ldr q5, [x20, #0x10]\n" + "ldr q6, [x20, #0x20]\n" + "blt 4f\n" + "3:" // main loop head + "fmla v8.4s, v4.4s, v0.s[0]\n" + "fmla v11.4s, v4.4s, v0.s[1]\n" + "ldr q2, [%x[Apanel], #0x20]\n" + "fmla v14.4s, v4.4s, v0.s[2]\n" + "fmla v17.4s, v4.4s, v0.s[3]\n" + "ldr q3,
[%x[Apanel], #0x30]\n" + "fmla v20.4s, v4.4s, v1.s[0]\n" + "fmla v23.4s, v4.4s, v1.s[1]\n" + "ldr q7, [x20, #0x30]\n" + "fmla v26.4s, v4.4s, v1.s[2]\n" + "fmla v29.4s, v4.4s, v1.s[3]\n" + "ldr q4, [x20, #0x40]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v5.4s, v0.s[1]\n" + "sub x19, x19, #0x4\n" + "fmla v15.4s, v5.4s, v0.s[2]\n" + "fmla v18.4s, v5.4s, v0.s[3]\n" + "cmp x19, #0x4\n" + "fmla v21.4s, v5.4s, v1.s[0]\n" + "fmla v24.4s, v5.4s, v1.s[1]\n" + "prfm pldl1keep, [%x[Apanel], #0x100]\n" + "fmla v27.4s, v5.4s, v1.s[2]\n" + "fmla v30.4s, v5.4s, v1.s[3]\n" + "ldr q5, [x20, #0x50]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [x20, #0x180]\n" + "fmla v16.4s, v6.4s, v0.s[2]\n" + "fmla v19.4s, v6.4s, v0.s[3]\n" + "ldr q0, [%x[Apanel], #0x40]\n" + "fmla v22.4s, v6.4s, v1.s[0]\n" + "fmla v25.4s, v6.4s, v1.s[1]\n" + "prfm pldl1keep, [x20, #0x1c0]\n" + "fmla v28.4s, v6.4s, v1.s[2]\n" + "fmla v31.4s, v6.4s, v1.s[3]\n" + "ldr q1, [%x[Apanel], #0x50]\n" + "fmla v8.4s, v7.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v2.s[1]\n" + "ldr q6, [x20, #0x60]\n" + "fmla v14.4s, v7.4s, v2.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "prfm pldl1keep, [%x[Apanel], #0x140]\n" + "fmla v20.4s, v7.4s, v3.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "prfm pldl1keep, [x20, #0x200]\n" + "fmla v26.4s, v7.4s, v3.s[2]\n" + "fmla v29.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x20, #0x70]\n" + "fmla v9.4s, v4.4s, v2.s[0]\n" + "fmla v12.4s, v4.4s, v2.s[1]\n" + "fmla v15.4s, v4.4s, v2.s[2]\n" + "fmla v18.4s, v4.4s, v2.s[3]\n" + "fmla v21.4s, v4.4s, v3.s[0]\n" + "fmla v24.4s, v4.4s, v3.s[1]\n" + "fmla v27.4s, v4.4s, v3.s[2]\n" + "fmla v30.4s, v4.4s, v3.s[3]\n" + "ldr q4, [x20, #0x80]\n" + "fmla v10.4s, v5.4s, v2.s[0]\n" + "fmla v13.4s, v5.4s, v2.s[1]\n" + "fmla v16.4s, v5.4s, v2.s[2]\n" + "fmla v19.4s, v5.4s, v2.s[3]\n" + "ldr q2, [%x[Apanel], #0x60]\n" + "fmla v22.4s, v5.4s, v3.s[0]\n" + "fmla v25.4s, v5.4s, v3.s[1]\n" + "fmla v28.4s, v5.4s, v3.s[2]\n" + "fmla v31.4s, v5.4s, v3.s[3]\n" + "ldr q3, [%x[Apanel], #0x70]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v11.4s, v6.4s, v0.s[1]\n" + "ldr q5, [x20, #0x90]\n" + "fmla v14.4s, v6.4s, v0.s[2]\n" + "fmla v17.4s, v6.4s, v0.s[3]\n" + "add %x[Apanel], %x[Apanel], #0x80\n" + "fmla v20.4s, v6.4s, v1.s[0]\n" + "fmla v23.4s, v6.4s, v1.s[1]\n" + "fmla v26.4s, v6.4s, v1.s[2]\n" + "fmla v29.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x20, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v12.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v0.s[2]\n" + "fmla v18.4s, v7.4s, v0.s[3]\n" + "fmla v21.4s, v7.4s, v1.s[0]\n" + "fmla v24.4s, v7.4s, v1.s[1]\n" + "fmla v27.4s, v7.4s, v1.s[2]\n" + "fmla v30.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x20, #0xb0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "add x20, x20, #0xc0\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "fmla v8.4s, v5.4s, v2.s[0]\n" + "fmla v11.4s, v5.4s, v2.s[1]\n" + "ldr q4, [x20, #0x0]\n" + "fmla v14.4s, v5.4s, v2.s[2]\n" + "fmla v17.4s, v5.4s, v2.s[3]\n" + "fmla v20.4s, v5.4s, v3.s[0]\n" + "fmla v23.4s, v5.4s, v3.s[1]\n" + "fmla v26.4s, v5.4s, v3.s[2]\n" + "fmla v29.4s, v5.4s, v3.s[3]\n" + "ldr q5, [x20, #0x10]\n" + "fmla v9.4s, v6.4s, v2.s[0]\n" + "fmla v12.4s, v6.4s, v2.s[1]\n" + "fmla v15.4s, v6.4s, v2.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla 
v21.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v3.s[1]\n" + "fmla v27.4s, v6.4s, v3.s[2]\n" + "fmla v30.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x20, #0x20]\n" + "fmla v10.4s, v7.4s, v2.s[0]\n" + "fmla v13.4s, v7.4s, v2.s[1]\n" + "fmla v16.4s, v7.4s, v2.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v22.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v3.s[1]\n" + "fmla v28.4s, v7.4s, v3.s[2]\n" + "fmla v31.4s, v7.4s, v3.s[3]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "fmla v11.4s, v4.4s, v0.s[1]\n" + "add x20, x20, #0x30\n" + "fmla v14.4s, v4.4s, v0.s[2]\n" + "fmla v17.4s, v4.4s, v0.s[3]\n" + "fmla v20.4s, v4.4s, v1.s[0]\n" + "fmla v23.4s, v4.4s, v1.s[1]\n" + "fmla v26.4s, v4.4s, v1.s[2]\n" + "fmla v29.4s, v4.4s, v1.s[3]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v5.4s, v0.s[1]\n" + "fmla v15.4s, v5.4s, v0.s[2]\n" + "fmla v18.4s, v5.4s, v0.s[3]\n" + "fmla v21.4s, v5.4s, v1.s[0]\n" + "fmla v24.4s, v5.4s, v1.s[1]\n" + "fmla v27.4s, v5.4s, v1.s[2]\n" + "fmla v30.4s, v5.4s, v1.s[3]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v6.4s, v0.s[1]\n" + "fmla v16.4s, v6.4s, v0.s[2]\n" + "fmla v19.4s, v6.4s, v0.s[3]\n" + "fmla v22.4s, v6.4s, v1.s[0]\n" + "fmla v25.4s, v6.4s, v1.s[1]\n" + "fmla v28.4s, v6.4s, v1.s[2]\n" + "fmla v31.4s, v6.4s, v1.s[3]\n" + "cbz x19, 6f\n" + "5:" // odd loop + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "subs x19, x19, #0x1\n" + "ldr q7, [x20, #0x0]\n" + "ldr q4, [x20, #0x10]\n" + "fmla v8.4s, v7.4s, v0.s[0]\n" + "ldr q5, [x20, #0x20]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v14.4s, v7.4s, v0.s[2]\n" + "fmla v17.4s, v7.4s, v0.s[3]\n" + "fmla v20.4s, v7.4s, v1.s[0]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v23.4s, v7.4s, v1.s[1]\n" + "fmla v26.4s, v7.4s, v1.s[2]\n" + "add x20, x20, #0x30\n" + "fmla v29.4s, v7.4s, v1.s[3]\n" + "fmla v9.4s, v4.4s, v0.s[0]\n" + "fmla v12.4s, v4.4s, v0.s[1]\n" + "fmla v15.4s, v4.4s, v0.s[2]\n" + "fmla v18.4s, v4.4s, v0.s[3]\n" + "fmla v21.4s, v4.4s, v1.s[0]\n" + "fmla v24.4s, v4.4s, v1.s[1]\n" + "fmla v27.4s, v4.4s, v1.s[2]\n" + "fmla v30.4s, v4.4s, v1.s[3]\n" + "fmla v10.4s, v5.4s, v0.s[0]\n" + "fmla v13.4s, v5.4s, v0.s[1]\n" + "fmla v16.4s, v5.4s, v0.s[2]\n" + "fmla v19.4s, v5.4s, v0.s[3]\n" + "fmla v22.4s, v5.4s, v1.s[0]\n" + "fmla v25.4s, v5.4s, v1.s[1]\n" + "fmla v28.4s, v5.4s, v1.s[2]\n" + "fmla v31.4s, v5.4s, v1.s[3]\n" + "bne 5b\n" + "6:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] 
"r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp new file mode 100644 index 0000000000..8ba36cb87d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <cstddef> + +namespace arm_gemm { + +void a64_interleaved_fp32_mla_8x12_x1( + const float *Apanel, const float *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const float *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x4\n" + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "prfm pldl1keep, [x20, #0xc0]\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x100]\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0xc0]\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "prfm pldl1keep, [x20, #0x140]\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q2, [x20, #0x0]\n" + "ldr q3, [x20, #0x10]\n" + "ldr q4, [x20, #0x20]\n" + "blt 4f\n" + "3:" // main loop head + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "sub x19, x19, #0x4\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "cmp x19, #0x4\n" + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "prfm pldl1keep, [%x[Apanel], #0x100]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "ldr q2, [x20, #0x30]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "prfm pldl1keep, [x20, #0x180]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "prfm pldl1keep, [x20, #0x1c0]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "prfm pldl1keep, [%x[Apanel], #0x140]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n" + "ldr q3, [x20, #0x70]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "ldr q0, [%x[Apanel], #0x40]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "ldr q1, [%x[Apanel], #0x50]\n" + "ldr q4, [x20, #0x80]\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "ldr q2, [x20, #0x90]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "ldr q3, [x20, #0xa0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "ldr q0, [%x[Apanel], #0x60]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "ldr q1, [%x[Apanel], #0x70]\n" + "ldr q4, [x20, #0xb0]\n" + "add %x[Apanel], %x[Apanel], #0x80\n" + "add x20, x20, #0xc0\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "ldr q2, [x20, #0x0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "ldr q3, [x20, #0x10]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "add x20, x20, #0x30\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + "cbz x19, 6f\n" + "5:" // odd loop + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "subs x19, x19, #0x1\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x20, #0x10]\n" + "fmla v8.4s, v5.4s, v0.s[0]\n" + 
"ldr q7, [x20, #0x20]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v14.4s, v5.4s, v0.s[2]\n" + "fmla v17.4s, v5.4s, v0.s[3]\n" + "fmla v20.4s, v5.4s, v1.s[0]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla v23.4s, v5.4s, v1.s[1]\n" + "fmla v26.4s, v5.4s, v1.s[2]\n" + "add x20, x20, #0x30\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v9.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v15.4s, v6.4s, v0.s[2]\n" + "fmla v18.4s, v6.4s, v0.s[3]\n" + "fmla v21.4s, v6.4s, v1.s[0]\n" + "fmla v24.4s, v6.4s, v1.s[1]\n" + "fmla v27.4s, v6.4s, v1.s[2]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v10.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v16.4s, v7.4s, v0.s[2]\n" + "fmla v19.4s, v7.4s, v0.s[3]\n" + "fmla v22.4s, v7.4s, v1.s[0]\n" + "fmla v25.4s, v7.4s, v1.s[1]\n" + "fmla v28.4s, v7.4s, v1.s[2]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "bne 5b\n" + "6:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp new file mode 100644 index 0000000000..bc6b9931e1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ +#include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + const int8_t *, const int8_t *, \ + int32_t *, int, int, int + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_interleaved_s8s32_dot_8x12( ARGLIST ); +void a64_interleaved_s8s32_dot_8x12_a55( ARGLIST ); +void a64_interleaved_s8s32_dot_8x12_x1( ARGLIST ); + +class cls_a64_interleaved_s8s32_dot_8x12 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return 12; + } + + static unsigned int stripe_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + + StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {}; + StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 15.361, 0.9341, 0.1636 }; + default: + return { 29.0698, 3.9793, 0.4003 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_interleaved_s8s32_dot_8x12; + cls_a64_interleaved_s8s32_dot_8x12(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_interleaved_s8s32_dot_8x12_a55; + break; + case CPUModel::X1: + kernel=a64_interleaved_s8s32_dot_8x12_x1; + break; + } + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp new file mode 100644 index 0000000000..3acd61c88c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_s8s32_dot_8x12_a55( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x26, %x[Apanel]\n" + "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x24, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x26\n" + "cmp x24, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x0]\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x40]\n" + "movi v12.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x80]\n" + "movi v14.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v15.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v16.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "movi v17.4s, #0x0\n" + "ldr q5, [x25, #0x10]\n" + "movi v18.4s, #0x0\n" + "ldr q6, [x25, #0x20]\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" + "ldr d2, [%x[Apanel], #0x20]\n" + "ldr x23, [%x[Apanel], #0x28]\n" + ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" + "ldr d3, [%x[Apanel], #0x30]\n" + ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" + "ldr x19, [%x[Apanel], #0x38]\n" + ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr x22, [x25, #0x38]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr x20, [x25, #0x48]\n" + ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" + "ldr x21, [x25, #0x58]\n" + ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" + "ldr d4, [x25, #0x30]\n" + ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" + "mov v2.d[1], x23\n" + ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" + "mov v3.d[1], x19\n" + ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" + "mov v4.d[1], x22\n" + ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" + "prfm pldl1keep, [x25, #0x100]\n" + ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" + "prfm pldl1keep, [x25, #0x140]\n" + ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" + "ldr d5, [x25, #0x40]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "mov v5.d[1], x20\n" + ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" + "ldr x20, [%x[Apanel], #0x8]\n" + ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" + "ldr x19, [%x[Apanel], #0x18]\n" +
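+ // In this A55-tuned variant each 128-bit operand load is split into a 64-bit "ldr d" plus a 64-bit "ldr x", recombined later via "mov v.d[1]", with the pieces interleaved between the sdot instructions; this is the usual scheme for keeping the Cortex-A55's single 64-bit load pipe dual-issuing alongside the SIMD pipeline.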
".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" + "ldr d0, [%x[Apanel], #0x0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "sub x24, x24, #0x2\n" + ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" + "cmp x24, #0x2\n" + ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" + "mov v0.d[1], x20\n" + ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" + "ldr d6, [x25, #0x50]\n" + "mov v6.d[1], x21\n" + "add x25, x25, #0x60\n" + ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n" + "ldr d1, [%x[Apanel], #0x10]\n" + ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n" + "ldr x22, [x25, #0x8]\n" + ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n" + "ldr x20, [x25, #0x18]\n" + ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n" + "ldr x21, [x25, #0x28]\n" + ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n" + "mov v1.d[1], x19\n" + ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n" + ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n" + ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n" + "ldr d4, [x25, #0x0]\n" + ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n" + "mov v4.d[1], x22\n" + ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n" + ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n" + ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n" + ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n" + "ldr d5, [x25, #0x10]\n" + ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n" + "mov v5.d[1], x20\n" + ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n" + "ldr d6, [x25, #0x20]\n" + "mov v6.d[1], x21\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" + "add x25, x25, #0x30\n" + ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" + "cbz x24, 5f\n" + 
"ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q7, [x25, #0x0]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x25, #0x10]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q5, [x25, #0x20]\n" + ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n" + "add x25, x25, #0x30\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x27, x27, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp new file mode 100644 index 0000000000..267f62ae8a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2021 Arm 
Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_s8s32_dot_8x12( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "ldr q4, [x20, #0x0]\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "ldr q5, [x20, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "ldr q6, [x20, #0x20]\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" + "ldr q2, [%x[Apanel], #0x20]\n" + ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" + "ldr q3, [%x[Apanel], #0x30]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "sub x19, x19, #0x2\n" + ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" + "ldr q4, [x20, #0x30]\n" + ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" + "cmp x19, #0x2\n" + ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b,
v0.4b[3]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" + "ldr q5, [x20, #0x40]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x20, #0x50]\n" + "add x20, x20, #0x60\n" + ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n" + ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n" + ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n" + ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n" + ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n" + "ldr q4, [x20, #0x0]\n" + ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n" + ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n" + ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n" + ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n" + "ldr q5, [x20, #0x10]\n" + ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" + "add x20, x20, #0x30\n" + ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4f80e8d0 // sdot v16.4s, 
v6.16b, v0.4b[2]\n" + ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q7, [x20, #0x0]\n" + "ldr q4, [x20, #0x10]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x20, #0x20]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n" + "add x20, x20, #0x30\n" + ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp 
b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp new file mode 100644 index 0000000000..4804c059a3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_s8s32_dot_8x12_x1( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "ldr q2, [x20, #0x0]\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "ldr q3, [x20, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "ldr q4, [x20, #0x20]\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" + "sub x19, x19, #0x2\n" + ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" + "cmp x19, #0x2\n" + ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x4fa1e85d // sdot v29.4s, v2.16b,
v1.4b[3]\n" + "ldr q2, [x20, #0x30]\n" + ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n" + ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" + "ldr q3, [x20, #0x40]\n" + ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" + "ldr q0, [%x[Apanel], #0x20]\n" + ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" + "ldr q1, [%x[Apanel], #0x30]\n" + "ldr q4, [x20, #0x50]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + "add x20, x20, #0x60\n" + ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" + ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" + ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" + ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n" + "ldr q2, [x20, #0x0]\n" + ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" + ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" + ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n" + ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" + "ldr q3, [x20, #0x10]\n" + ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" + "add x20, x20, #0x30\n" + ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" + ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" + ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n" + ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" + ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" + ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x4fa1e078 // sdot v24.4s, 
v3.16b, v1.4b[1]\n" + ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" + ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x20, #0x10]\n" + ".inst 0x4f80e0a8 // sdot v8.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x20, #0x20]\n" + ".inst 0x4fa0e0ab // sdot v11.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8ae // sdot v14.4s, v5.16b, v0.4b[2]\n" + "add x20, x20, #0x30\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b7 // sdot v23.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8ba // sdot v26.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8bd // sdot v29.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4fa0e0cc // sdot v12.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4f80e8cf // sdot v15.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4fa1e0d8 // sdot v24.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4f81e8db // sdot v27.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4fa1e8de // sdot v30.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4fa0e0ed // sdot v13.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4fa1e0f9 // sdot v25.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4f81e8fc // sdot v28.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4fa1e8ff // sdot v31.4s, v7.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", 
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp index b17b76f170..ff69bc8f53 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,64 +10,103 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once #ifdef __aarch64__ - -#include #include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const int8_t *, const int8_t *, \ + int32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_interleaved_s8s32_mmla_8x12( ARGLIST ); -class cls_a64_interleaved_s8s32_mmla_8x12 { +class cls_a64_interleaved_s8s32_mmla_8x12 +{ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return 12; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return 4; } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 8; } - // Use the standard fixed size transforms. 
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {}; StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=a64_interleaved_s8s32_mmla_8x12; + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 62.57, 4.08, 8.01 }; + case CPUModel::A510: + return { 48.25, 3.53, 3.71 }; + case CPUModel::V1: + return { 117.02, 4.98, 10.87 }; + } + } + + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 62.53, 3.70, 0.50 }; + case CPUModel::A510: + return { 48.22, 2.49, 0.29 }; + case CPUModel::V1: + return { 116.76, 4.67, 0.60 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=a64_interleaved_s8s32_mmla_8x12; cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp index 2093e75b8e..0c2722a1c2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,373 +23,340 @@ */ #ifdef __aarch64__ +#include <cstddef> #include <cstdint> -#include "../../asmlib.hpp" namespace arm_gemm { -void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; +void a64_interleaved_s8s32_mmla_8x12( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; - for (int yb=0; yb<ablocks; yb++) { StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {}; + StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, uint32_t>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 15.361, 0.9341, 0.1636 }; + default: + return { 29.0698, 3.9793, 0.4003 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=a64_interleaved_u8u32_dot_8x12; + cls_a64_interleaved_u8u32_dot_8x12(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_interleaved_u8u32_dot_8x12_a55; + break; + case CPUModel::X1: + kernel=a64_interleaved_u8u32_dot_8x12_x1; + break; + } + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp new file mode 100644 index 0000000000..7892306153 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_u8u32_dot_8x12_a55( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x26, %x[Apanel]\n" + "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x24, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x26\n" + "cmp x24, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x0]\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x40]\n" + "movi v12.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x25, #0x80]\n" + "movi v14.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v15.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v16.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "movi v17.4s, #0x0\n" + "ldr q5, [x25, #0x10]\n" + "movi v18.4s, #0x0\n" + "ldr q6, [x25, #0x20]\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" + "ldr d2, [%x[Apanel], #0x20]\n" + "ldr x23, [%x[Apanel], #0x28]\n" + ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" + "ldr d3, [%x[Apanel], #0x30]\n" + ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" + "ldr x19, [%x[Apanel], #0x38]\n" + ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr x22, [x25, #0x38]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr x20, [x25, #0x48]\n" + ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" + "ldr x21, [x25, #0x58]\n" + ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" + "ldr d4, [x25, #0x30]\n" + ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" + "mov v2.d[1], x23\n" + ".inst 0x6fa0e0ac // udot
v12.4s, v5.16b, v0.4b[1]\n" + "mov v3.d[1], x19\n" + ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" + "mov v4.d[1], x22\n" + ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" + "prfm pldl1keep, [x25, #0x100]\n" + ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" + "prfm pldl1keep, [x25, #0x140]\n" + ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" + "ldr d5, [x25, #0x40]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "mov v5.d[1], x20\n" + ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" + "ldr x20, [%x[Apanel], #0x8]\n" + ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" + "ldr x19, [%x[Apanel], #0x18]\n" + ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" + "ldr d0, [%x[Apanel], #0x0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "sub x24, x24, #0x2\n" + ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" + "cmp x24, #0x2\n" + ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" + "mov v0.d[1], x20\n" + ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" + "ldr d6, [x25, #0x50]\n" + "mov v6.d[1], x21\n" + "add x25, x25, #0x60\n" + ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n" + "ldr d1, [%x[Apanel], #0x10]\n" + ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n" + "ldr x22, [x25, #0x8]\n" + ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n" + "ldr x20, [x25, #0x18]\n" + ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n" + "ldr x21, [x25, #0x28]\n" + ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n" + "mov v1.d[1], x19\n" + ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n" + ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n" + ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n" + "ldr d4, [x25, #0x0]\n" + ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n" + "mov v4.d[1], x22\n" + ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n" + ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n" + ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n" + ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n" + "ldr d5, [x25, #0x10]\n" + ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n" + "mov v5.d[1], x20\n" + ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n" + "ldr d6, [x25, #0x20]\n" + "mov v6.d[1], x21\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" + "add x25, x25, #0x30\n" + ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 
0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" + "cbz x24, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q7, [x25, #0x0]\n" + ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x25, #0x10]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q5, [x25, #0x20]\n" + ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n" + "add x25, x25, #0x30\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x27, x27, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" 
(offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp new file mode 100644 index 0000000000..42226e90f5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_u8u32_dot_8x12( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "ldr q4, [x20, #0x0]\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "ldr q5, [x20, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "ldr q6, [x20, #0x20]\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" + "ldr q2, [%x[Apanel], #0x20]\n" + ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" + "ldr q3, [%x[Apanel], #0x30]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "sub x19, x19, #0x2\n" + ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" + "ldr q4, [x20, #0x30]\n" + ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" + "cmp x19, #0x2\n" + ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" + "ldr q5, [x20, #0x40]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x20, #0x50]\n" + "add x20, x20, #0x60\n" + ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + ".inst 0x6f82e88e // udot v14.4s,
v4.16b, v2.4b[2]\n" + ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n" + ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n" + ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n" + ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n" + "ldr q4, [x20, #0x0]\n" + ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n" + ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n" + ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n" + ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n" + "ldr q5, [x20, #0x10]\n" + ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" + "add x20, x20, #0x30\n" + ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q7, [x20, #0x0]\n" + "ldr q4, [x20, #0x10]\n" + ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x20, #0x20]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n" + "add x20, x20, #0x30\n" + ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e88f // udot v15.4s, v4.16b, 
v0.4b[2]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp new file mode 100644 index 0000000000..652f2bffc5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include <cstddef> +#include <cstdint> + +namespace arm_gemm { + +void a64_interleaved_u8u32_dot_8x12_x1( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x0]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x0]\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x40]\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "prfm pldl1keep, [%x[Apanel], #0x40]\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "prfm pldl1keep, [x20, #0x80]\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "ldr q2, [x20, #0x0]\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "ldr q3, [x20, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "ldr q4, [x20, #0x20]\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" + "sub x19, x19, #0x2\n" + ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" + "cmp x19, #0x2\n" + ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" + "prfm pldl1keep, [%x[Apanel], #0x80]\n" + ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" + "ldr q2, [x20, #0x30]\n" + ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" + "prfm pldl1keep, [x20, #0x100]\n" + ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" + "prfm pldl1keep, [x20, #0x140]\n" + ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" + ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" + "ldr q3, [x20, #0x40]\n" + ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" + "ldr q0, [%x[Apanel], #0x20]\n" + ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 
0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" + "ldr q1, [%x[Apanel], #0x30]\n" + "ldr q4, [x20, #0x50]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + "add x20, x20, #0x60\n" + ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" + ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" + ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" + ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" + "ldr q2, [x20, #0x0]\n" + ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" + ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" + ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" + ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" + "ldr q3, [x20, #0x10]\n" + ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" + "ldr q0, [%x[Apanel], #0x0]\n" + ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [x20, #0x20]\n" + "bge 3b\n" + "4:" // main loop skip + "add %x[Apanel], %x[Apanel], #0x20\n" + ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" + ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" + "add x20, x20, #0x30\n" + ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" + ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" + ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" + ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" + ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" + ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" + ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" + ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" + ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" + ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" + ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" + ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" + ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" + "cbz x19, 5f\n" + "ldr q0, [%x[Apanel], #0x0]\n" + "ldr q1, [%x[Apanel], #0x10]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x20, #0x10]\n" + ".inst 0x6f80e0a8 // udot v8.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x20, #0x20]\n" + ".inst 0x6fa0e0ab // udot v11.4s, v5.16b, v0.4b[1]\n" + ".inst 
0x6f80e8ae // udot v14.4s, v5.16b, v0.4b[2]\n" + "add x20, x20, #0x30\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6fa1e0b7 // udot v23.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6f81e8ba // udot v26.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6fa1e8bd // udot v29.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6fa0e0cc // udot v12.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6f80e8cf // udot v15.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6fa1e0d8 // udot v24.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6f81e8db // udot v27.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6fa1e8de // udot v30.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6fa0e0ed // udot v13.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6fa1e0f9 // udot v25.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6f81e8fc // udot v28.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6fa1e8ff // udot v31.4s, v7.16b, v1.4b[3]\n" + "5:" // multiply loop done + "subs x22, x22, #0x1\n" + "str q8, [%x[Cpanel], #0x0]\n" + "str q9, [%x[Cpanel], #0x10]\n" + "str q10, [%x[Cpanel], #0x20]\n" + "str q11, [%x[Cpanel], #0x30]\n" + "str q12, [%x[Cpanel], #0x40]\n" + "str q13, [%x[Cpanel], #0x50]\n" + "str q14, [%x[Cpanel], #0x60]\n" + "str q15, [%x[Cpanel], #0x70]\n" + "str q16, [%x[Cpanel], #0x80]\n" + "str q17, [%x[Cpanel], #0x90]\n" + "str q18, [%x[Cpanel], #0xa0]\n" + "str q19, [%x[Cpanel], #0xb0]\n" + "str q20, [%x[Cpanel], #0xc0]\n" + "str q21, [%x[Cpanel], #0xd0]\n" + "str q22, [%x[Cpanel], #0xe0]\n" + "str q23, [%x[Cpanel], #0xf0]\n" + "str q24, [%x[Cpanel], #0x100]\n" + "str q25, [%x[Cpanel], #0x110]\n" + "str q26, [%x[Cpanel], #0x120]\n" + "str q27, [%x[Cpanel], #0x130]\n" + "str q28, [%x[Cpanel], #0x140]\n" + "str q29, [%x[Cpanel], #0x150]\n" + "str q30, [%x[Cpanel], #0x160]\n" + "str q31, [%x[Cpanel], #0x170]\n" + "add %x[Cpanel], %x[Cpanel], #0x180\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp index 99dd0be0d9..f492a474ae 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
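Reader's note on the udot blocks in the kernel that ends above: each encoded instruction such as ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]" multiplies four unsigned bytes of a B-panel vector by a broadcast group of four A-panel bytes and accumulates into four 32-bit lanes, so the 24 accumulators v8-v31 hold an 8x12 block of C as 8 rows of 3 vectors. A scalar model of one such instruction, in plain C++ purely for illustration (not code from this patch):

    #include <cstdint>
    // Scalar model of "udot vd.4s, vn.16b, vm.4b[i]": each of the 4 output
    // lanes gains a 4-way u8*u8 dot product, accumulated destructively.
    static void udot_by_element(uint32_t d[4], const uint8_t n[16], const uint8_t m[16], int i)
    {
        for (int lane = 0; lane < 4; lane++) {
            uint32_t acc = 0;
            for (int b = 0; b < 4; b++) {
                acc += uint32_t(n[4 * lane + b]) * uint32_t(m[4 * i + b]); // 4-way dot
            }
            d[lane] += acc; // accumulate into the existing 32-bit lane
        }
    }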
* * SPDX-License-Identifier: MIT * @@ -10,64 +10,103 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once #ifdef __aarch64__ - -#include #include "../std_transforms_fixed.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const uint8_t *, const uint8_t *, \ + uint32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void a64_interleaved_u8u32_mmla_8x12( ARGLIST ); -class cls_a64_interleaved_u8u32_mmla_8x12 { +class cls_a64_interleaved_u8u32_mmla_8x12 +{ public: typedef uint8_t operand_type; typedef uint32_t result_type; - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return 12; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return 4; } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 8; } - // Use the standard fixed size transforms. 
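Aside on the ARGLIST macro introduced in the header hunk above: routing both the kernel declaration and the kern_type typedef through one macro keeps the two signatures from drifting apart, and the #undef ARGLIST at the end of the header stops the macro leaking into other translation units. A minimal standalone illustration of the same pattern, with hypothetical names (not identifiers from this patch):

    // Illustration only (hypothetical names): one macro, two signatures kept in sync.
    #define EXAMPLE_ARGLIST const float *, float *, int

    void example_kernel( EXAMPLE_ARGLIST );               // free-function declaration
    typedef void (*example_kern_type)( EXAMPLE_ARGLIST ); // matching pointer type

    void example_kernel(const float *, float *, int) {}   // trivial definition
    static example_kern_type chosen_kernel = example_kernel;
    #undef EXAMPLE_ARGLIST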
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {}; StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=a64_interleaved_u8u32_mmla_8x12; + if (std::is_same<T, uint32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 62.58, 4.06, 8.02 }; + case CPUModel::A510: + return { 47.83, 3.59, 3.72 }; + case CPUModel::V1: + return { 111.52, 4.97, 10.80 }; + } + } + + + if (std::is_same<T, uint8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 62.57, 4.10, 0.51 }; + case CPUModel::A510: + return { 47.66, 2.47, 0.29 }; + case CPUModel::V1: + return { 111.60, 4.95, 0.66 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=a64_interleaved_u8u32_mmla_8x12; cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp index 238a703708..e67d17e49a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp @@ -23,395 +23,269 @@ */ #ifdef __aarch64__ +#include <cstddef> #include <cstdint> -#include "../../asmlib.hpp" namespace arm_gemm { -void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb transforms = {}; + template<typename T> static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 3.954, 1.252, 1.141 }; + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 3.954, 1.252, 1.141 }; - case CPUModel::A53: - return { 2.777, 0.987, 0.898 }; + case CPUModel::A53: + return { 2.777, 0.987, 0.898 }; - case CPUModel::A73: - return { 2.885, 1.429, 1.163 }; + case CPUModel::A73: + return { 2.885, 1.429, 1.163 }; - default: - return { 7.2307, 3.876, 2.932 }; + default: + return { 7.2307, 3.876, 2.932 }; + } + } + + if (std::is_same<T, bfloat16>::value) { + switch(ci->get_cpu_model()) { + case CPUModel::A510: + return { 4.98, 2.27, 3.05 }; + + default: + return { 7.99, 5.06, 7.32 }; + } + } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp index 2e87a47036..52548b462c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp @@ -24,7 +24,6 @@ #ifdef __aarch64__ #include -#include #include "arm_gemm.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp index ca4a44a2c7..deaef27ee9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp @@ -24,7 +24,6 @@ #ifdef __aarch64__ #include -#include #include "arm_gemm.hpp" diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp deleted file mode 100644 index 57fd9c909e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../performance_parameters.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool); - -class cls_sve_gemv_fp32_mla_8VL -{ -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool); - - static unsigned int out_width() - { - return 8 * get_vector_length<float>(); - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_gemv_fp32_mla_8VL; - - cls_sve_gemv_fp32_mla_8VL(const CPUInfo *) - { - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp deleted file mode 100644 index 78387de90c..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp +++ /dev/null @@ -1,1372 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited.
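On the get_performance_parameters() hooks added in the headers above: each {x, y, z} triple is a per-CPU throughput estimate, roughly MACs per cycle for the inner loop plus bytes per cycle for preparing operands and merging results, which lets the framework compare candidate kernels for a given problem shape instead of hard-coding a choice. A sketch of how such numbers could feed a cycle estimate; the struct, field names, and estimator are assumptions for illustration, not code from this patch:

    // Sketch (assumed names, not from the patch): turning a {macs/cycle,
    // prepare bytes/cycle, merge bytes/cycle} triple into a cycle estimate.
    struct ExamplePerfParams {
        float kernel_macs_cycle;
        float prepare_bytes_cycle;
        float merge_bytes_cycle;
    };

    static float estimate_cycles(unsigned M, unsigned N, unsigned K,
                                 unsigned bytes_per_element, const ExamplePerfParams &p)
    {
        float macs   = float(M) * float(N) * float(K);
        float cycles = macs / p.kernel_macs_cycle;                                  // inner loop
        cycles      += (float(K) * N * bytes_per_element) / p.prepare_bytes_cycle;  // operand prep
        cycles      += (float(M) * N * bytes_per_element) / p.merge_bytes_cycle;    // result merge
        return cycles; // the lowest estimate wins when ranking candidate kernels
    }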
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef ARM_COMPUTE_ENABLE_SVE - -#include "arm_gemm.hpp" -#include "../../utils.hpp" - -#include - -namespace arm_gemm { - -void sve_gemv_fp32_mla_8VL ( - const float *A_ptr, const float *B_ptr, float *output_ptr, - size_t N, size_t K, - const float *bias, Activation act, bool -) -{ - struct KernelArgs { - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - const float *B_ptr = {}; - size_t output_offset = {}; - unsigned int input_initial_col = {}; - } ka; - - unsigned long flags=0; - ka.B_ptr = B_ptr; - switch(act.type) { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - ka.maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - ka.minval = 0; - flags |= 0x2; - break; - } - __asm__ __volatile__( - "ptrue p2.b\n" - "cntw x24\n" - "add x23, %x[N], x24\n" - "sub x23, x23, #0x1\n" - "udiv x23, x23, x24\n" - "mov x22, %x[bias]\n" - "1:" // Column loop - "cmp x23, #0x8\n" - "bge 50f\n" - "cmp x23, #0x6\n" - "bgt 43f\n" - "beq 36f\n" - "cmp x23, #0x4\n" - "bgt 29f\n" - "beq 22f\n" - "cmp x23, #0x2\n" - "bgt 15f\n" - "beq 8f\n" - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "whilelt p1.s, XZR, %x[N]\n" - "cbz x22, 2f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "addvl x22, x22, #1\n" - "b 3f\n" - "2:" // Width 1: no bias - "mov z24.b, #0x0\n" - "3:" // Width 1: setup done - "cmp x21, #0x4\n" - "ble 5f\n" - "4:" // Width 1: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x20, x20, #0x10\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x21, x21, #0x4\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z2.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "cmp x21, #0x4\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z3.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z4.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - 
"prfm pldl1keep, [x20, #0x80]\n" - "bgt 4b\n" - "5:" // Width 1: Multiply loop: Single iteration only - "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z5.s, z0.s[0]\n" - "add x20, x20, #0x10\n" - "subs x21, x21, #0x1\n" - "ble 6f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z6.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ble 6f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z7.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ble 6f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z8.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "6:" // Width 1: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 7f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "7:" // Width 1: No activation - "st1w { z24.s }, p1, [%x[output_ptr]]\n" - "addvl %x[output_ptr], %x[output_ptr], #1\n" - "b 57f\n" - "8:" // Width 2 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "sub x19, %x[N], x24\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 9f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "addvl x22, x22, #2\n" - "b 10f\n" - "9:" // Width 2: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "10:" // Width 2: setup done - "cmp x21, #0x4\n" - "ble 12f\n" - "11:" // Width 2: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "sub x21, x21, #0x4\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z3.s, z0.s[1]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z4.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "cmp x21, #0x4\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z5.s, z0.s[2]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z6.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z7.s, z0.s[3]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z8.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "bgt 11b\n" - "12:" // Width 2: Multiply loop: Single iteration only - "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "ld1rqw { z0.s 
}, p0/Z, [x20]\n" - "fmla z24.s, z9.s, z0.s[0]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z10.s, z0.s[0]\n" - "subs x21, x21, #0x1\n" - "ble 13f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z11.s, z0.s[1]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z12.s, z0.s[1]\n" - "ble 13f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z13.s, z0.s[2]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z14.s, z0.s[2]\n" - "ble 13f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z25.s, z16.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "13:" // Width 2: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 14f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "14:" // Width 2: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #2\n" - "b 57f\n" - "15:" // Width 3 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x2\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 16f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "addvl x22, x22, #3\n" - "b 17f\n" - "16:" // Width 3: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "17:" // Width 3: setup done - "cmp x21, #0x4\n" - "ble 19f\n" - "18:" // Width 3: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "cmp x21, #0x4\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z4.s, z0.s[1]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z5.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z26.s, z6.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z7.s, z0.s[2]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z8.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z26.s, z9.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - 
"ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z10.s, z0.s[3]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z11.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z26.s, z12.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "bgt 18b\n" - "19:" // Width 3: Multiply loop: Single iteration only - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z13.s, z0.s[0]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z14.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z15.s, z0.s[0]\n" - "ble 20f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z16.s, z0.s[1]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z17.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z18.s, z0.s[1]\n" - "ble 20f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z19.s, z0.s[2]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z20.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z21.s, z0.s[2]\n" - "ble 20f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z22.s, z0.s[3]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z23.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z26.s, z1.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "20:" // Width 3: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 21f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin z26.s, p2/M, z26.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "21:" // Width 3: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #3\n" - "b 57f\n" - "22:" // Width 4 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x3\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 23f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" - "addvl x22, x22, #4\n" - "b 24f\n" - "23:" // Width 4: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z27.b, #0x0\n" - "24:" // Width 4: setup done - "cmp x21, #0x4\n" - "ble 26f\n" - "25:" // Width 4: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, 
[%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "cmp x21, #0x4\n" - "fmla z27.s, z4.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z5.s, z0.s[1]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z6.s, z0.s[1]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z7.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z27.s, z8.s, z0.s[1]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[2]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z10.s, z0.s[2]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z11.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z27.s, z12.s, z0.s[2]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z13.s, z0.s[3]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z14.s, z0.s[3]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z15.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z27.s, z16.s, z0.s[3]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "bgt 25b\n" - "26:" // Width 4: Multiply loop: Single iteration only - "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z17.s, z0.s[0]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z18.s, z0.s[0]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z19.s, z0.s[0]\n" - "fmla z27.s, z20.s, z0.s[0]\n" - "ble 27f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z22.s, z0.s[1]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z23.s, z0.s[1]\n" - "fmla z27.s, z1.s, z0.s[1]\n" - "ble 27f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z2.s, z0.s[2]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z3.s, z0.s[2]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z4.s, z0.s[2]\n" - "fmla z27.s, z5.s, z0.s[2]\n" - "ble 27f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { 
z6.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z7.s, z0.s[3]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z26.s, z8.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z27.s, z9.s, z0.s[3]\n" - "27:" // Width 4: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 28f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin z26.s, p2/M, z26.s, z16.s\n" - "fmin z27.s, p2/M, z27.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "fmax z27.s, p2/M, z27.s, z17.s\n" - "28:" // Width 4: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" - "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #4\n" - "b 57f\n" - "29:" // Width 5 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x4\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 30f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" - "addvl x22, x22, #5\n" - "b 31f\n" - "30:" // Width 5: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z28.b, #0x0\n" - "31:" // Width 5: setup done - "cmp x21, #0x4\n" - "ble 33f\n" - "32:" // Width 5: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "cmp x21, #0x4\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z4.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z28.s, z5.s, z0.s[0]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z6.s, z0.s[1]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z7.s, z0.s[1]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z8.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z27.s, z9.s, z0.s[1]\n" - "fmla z28.s, z10.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z11.s, z0.s[2]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z12.s, z0.s[2]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z13.s, z0.s[2]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - 
"addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z14.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z28.s, z15.s, z0.s[2]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z16.s, z0.s[3]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z17.s, z0.s[3]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z18.s, z0.s[3]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z19.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z28.s, z20.s, z0.s[3]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "bgt 32b\n" - "33:" // Width 5: Multiply loop: Single iteration only - "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z21.s, z0.s[0]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z22.s, z0.s[0]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z23.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z1.s, z0.s[0]\n" - "fmla z28.s, z2.s, z0.s[0]\n" - "ble 34f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z3.s, z0.s[1]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z4.s, z0.s[1]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z5.s, z0.s[1]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z6.s, z0.s[1]\n" - "fmla z28.s, z7.s, z0.s[1]\n" - "ble 34f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z8.s, z0.s[2]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z9.s, z0.s[2]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z10.s, z0.s[2]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z11.s, z0.s[2]\n" - "fmla z28.s, z12.s, z0.s[2]\n" - "ble 34f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z13.s, z0.s[3]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z14.s, z0.s[3]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z15.s, z0.s[3]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z16.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z28.s, z17.s, z0.s[3]\n" - "34:" // Width 5: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 35f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin 
z26.s, p2/M, z26.s, z16.s\n" - "fmin z27.s, p2/M, z27.s, z16.s\n" - "fmin z28.s, p2/M, z28.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "fmax z27.s, p2/M, z27.s, z17.s\n" - "fmax z28.s, p2/M, z28.s, z17.s\n" - "35:" // Width 5: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" - "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" - "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #5\n" - "b 57f\n" - "36:" // Width 6 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x5\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 37f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" - "addvl x22, x22, #6\n" - "b 38f\n" - "37:" // Width 6: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z28.b, #0x0\n" - "mov z29.b, #0x0\n" - "38:" // Width 6: setup done - "cmp x21, #0x4\n" - "ble 40f\n" - "39:" // Width 6: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "cmp x21, #0x4\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z4.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z5.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z29.s, z6.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z7.s, z0.s[1]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z8.s, z0.s[1]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z10.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z28.s, z11.s, z0.s[1]\n" - "fmla z29.s, z12.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z13.s, z0.s[2]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z14.s, z0.s[2]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z15.s, z0.s[2]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z16.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z28.s, z17.s, z0.s[2]\n" - "fmla z29.s, z18.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, 
z20.s, z0.s[3]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z21.s, z0.s[3]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z22.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z28.s, z23.s, z0.s[3]\n" - "fmla z29.s, z1.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "bgt 39b\n" - "40:" // Width 6: Multiply loop: Single iteration only - "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z3.s, z0.s[0]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z4.s, z0.s[0]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z27.s, z5.s, z0.s[0]\n" - "fmla z28.s, z6.s, z0.s[0]\n" - "fmla z29.s, z7.s, z0.s[0]\n" - "ble 41f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z8.s, z0.s[1]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z10.s, z0.s[1]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z11.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z12.s, z0.s[1]\n" - "fmla z29.s, z13.s, z0.s[1]\n" - "ble 41f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z15.s, z0.s[2]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z16.s, z0.s[2]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z17.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z18.s, z0.s[2]\n" - "fmla z29.s, z19.s, z0.s[2]\n" - "ble 41f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z20.s, z0.s[3]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z21.s, z0.s[3]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z22.s, z0.s[3]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z23.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z28.s, z1.s, z0.s[3]\n" - "fmla z29.s, z2.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "41:" // Width 6: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 42f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - 
"fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin z26.s, p2/M, z26.s, z16.s\n" - "fmin z27.s, p2/M, z27.s, z16.s\n" - "fmin z28.s, p2/M, z28.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "fmax z27.s, p2/M, z27.s, z17.s\n" - "fmax z28.s, p2/M, z28.s, z17.s\n" - "fmin z29.s, p2/M, z29.s, z16.s\n" - "fmax z29.s, p2/M, z29.s, z17.s\n" - "42:" // Width 6: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" - "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" - "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" - "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #6\n" - "b 57f\n" - "43:" // Width 7 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x6\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 44f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" - "addvl x22, x22, #7\n" - "b 45f\n" - "44:" // Width 7: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z28.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z30.b, #0x0\n" - "45:" // Width 7: setup done - "cmp x21, #0x4\n" - "ble 47f\n" - "46:" // Width 7: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "cmp x21, #0x4\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z4.s, z0.s[0]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z28.s, z5.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z29.s, z6.s, z0.s[0]\n" - "fmla z30.s, z7.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z8.s, z0.s[1]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z10.s, z0.s[1]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z11.s, z0.s[1]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z12.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z29.s, z13.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z30.s, z14.s, z0.s[1]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z15.s, z0.s[2]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z25.s, z16.s, z0.s[2]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z17.s, z0.s[2]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - 
"fmla z27.s, z18.s, z0.s[2]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z19.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z29.s, z20.s, z0.s[2]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z30.s, z21.s, z0.s[2]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[3]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z25.s, z23.s, z0.s[3]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z1.s, z0.s[3]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z2.s, z0.s[3]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z3.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z29.s, z4.s, z0.s[3]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "fmla z30.s, z5.s, z0.s[3]\n" - "bgt 46b\n" - "47:" // Width 7: Multiply loop: Single iteration only - "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z6.s, z0.s[0]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z7.s, z0.s[0]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z8.s, z0.s[0]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z27.s, z9.s, z0.s[0]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z10.s, z0.s[0]\n" - "fmla z29.s, z11.s, z0.s[0]\n" - "fmla z30.s, z12.s, z0.s[0]\n" - "ble 48f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z14.s, z0.s[1]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z15.s, z0.s[1]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z16.s, z0.s[1]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z17.s, z0.s[1]\n" - "fmla z29.s, z18.s, z0.s[1]\n" - "fmla z30.s, z19.s, z0.s[1]\n" - "ble 48f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z20.s, z0.s[2]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z21.s, z0.s[2]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z22.s, z0.s[2]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z23.s, z0.s[2]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z1.s, z0.s[2]\n" - "fmla z29.s, z2.s, z0.s[2]\n" - "fmla z30.s, z3.s, z0.s[2]\n" - "ble 48f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z4.s, z0.s[3]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z6.s }, 
p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z5.s, z0.s[3]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z6.s, z0.s[3]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z7.s, z0.s[3]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z28.s, z8.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z29.s, z9.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z30.s, z10.s, z0.s[3]\n" - "48:" // Width 7: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 49f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin z26.s, p2/M, z26.s, z16.s\n" - "fmin z27.s, p2/M, z27.s, z16.s\n" - "fmin z28.s, p2/M, z28.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "fmax z27.s, p2/M, z27.s, z17.s\n" - "fmax z28.s, p2/M, z28.s, z17.s\n" - "fmin z29.s, p2/M, z29.s, z16.s\n" - "fmin z30.s, p2/M, z30.s, z16.s\n" - "fmax z29.s, p2/M, z29.s, z17.s\n" - "fmax z30.s, p2/M, z30.s, z17.s\n" - "49:" // Width 7: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" - "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" - "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" - "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" - "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #7\n" - "b 57f\n" - "50:" // Width 8 - "mov x21, %x[K]\n" - "mov x20, %x[A_ptr]\n" - "mov x19, #0x7\n" - "msub x19, x24, x19, %x[N]\n" - "whilelt p1.s, XZR, x19\n" - "cbz x22, 51f\n" - "ld1w { z24.s }, p2/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" - "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n" - "addvl x22, x22, #8\n" - "b 52f\n" - "51:" // Width 8: no bias - "mov z24.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z28.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z30.b, #0x0\n" - "mov z31.b, #0x0\n" - "52:" // Width 8: setup done - "cmp x21, #0x4\n" - "ble 54f\n" - "53:" // Width 8: Multiply loop: Main loop head - "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "sub x21, x21, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z1.s, z0.s[0]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z2.s, z0.s[0]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "cmp x21, #0x4\n" - "fmla z26.s, z3.s, z0.s[0]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z4.s, z0.s[0]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z28.s, z5.s, z0.s[0]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z6.s, z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "fmla z30.s, z7.s, 
z0.s[0]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z31.s, z8.s, z0.s[0]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z25.s, z10.s, z0.s[1]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z11.s, z0.s[1]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z12.s, z0.s[1]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z28.s, z13.s, z0.s[1]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z14.s, z0.s[1]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z30.s, z15.s, z0.s[1]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z31.s, z16.s, z0.s[1]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z17.s, z0.s[2]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z25.s, z18.s, z0.s[2]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z19.s, z0.s[2]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z20.s, z0.s[2]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z28.s, z21.s, z0.s[2]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z22.s, z0.s[2]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z30.s, z23.s, z0.s[2]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z31.s, z1.s, z0.s[2]\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "fmla z24.s, z2.s, z0.s[3]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z25.s, z3.s, z0.s[3]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z4.s, z0.s[3]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z5.s, z0.s[3]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z28.s, z6.s, z0.s[3]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z7.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z30.s, z8.s, z0.s[3]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "fmla z31.s, z9.s, z0.s[3]\n" - "bgt 53b\n" - "54:" // Width 8: Multiply loop: Single iteration only - "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" - "whilelt p0.s, XZR, x21\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "subs x21, x21, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x20]\n" - "fmla z24.s, z10.s, z0.s[0]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "add x20, x20, #0x10\n" - "fmla z25.s, z11.s, z0.s[0]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "fmla z26.s, z12.s, z0.s[0]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "fmla z27.s, z13.s, z0.s[0]\n" - "fmla z28.s, z14.s, z0.s[0]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z15.s, z0.s[0]\n" - "fmla z30.s, z16.s, z0.s[0]\n" - "fmla z31.s, z17.s, z0.s[0]\n" - "ble 55f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z18.s, 
z0.s[1]\n" - "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z19.s, z0.s[1]\n" - "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z20.s, z0.s[1]\n" - "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z21.s, z0.s[1]\n" - "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "fmla z28.s, z22.s, z0.s[1]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z23.s, z0.s[1]\n" - "fmla z30.s, z1.s, z0.s[1]\n" - "fmla z31.s, z2.s, z0.s[1]\n" - "ble 55f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "subs x21, x21, #0x1\n" - "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z3.s, z0.s[2]\n" - "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z4.s, z0.s[2]\n" - "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z5.s, z0.s[2]\n" - "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z6.s, z0.s[2]\n" - "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "fmla z28.s, z7.s, z0.s[2]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z8.s, z0.s[2]\n" - "fmla z30.s, z9.s, z0.s[2]\n" - "fmla z31.s, z10.s, z0.s[2]\n" - "ble 55f\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" - "fmla z25.s, z12.s, z0.s[3]\n" - "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" - "fmla z26.s, z13.s, z0.s[3]\n" - "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" - "fmla z27.s, z14.s, z0.s[3]\n" - "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" - "fmla z28.s, z15.s, z0.s[3]\n" - "addvl %x[B_ptr], %x[B_ptr], #8\n" - "fmla z29.s, z16.s, z0.s[3]\n" - "prfm pldl1keep, [%x[B_ptr], #0x400]\n" - "prfm pldl1keep, [%x[B_ptr], #0x440]\n" - "fmla z30.s, z17.s, z0.s[3]\n" - "fmla z31.s, z18.s, z0.s[3]\n" - "55:" // Width 8: Multiply loop: multiply skip - "prfm pldl1keep, [x20, #0x80]\n" - "prfm pstl1keep, [%x[output_ptr], #0x0]\n" - "tbz %x[flags], #1, 56f\n" - "add x19, %x[args_ptr], %[offset_min]\n" - "ld1rw { z17.s }, p2/Z, [x19]\n" - "add x19, %x[args_ptr], %[offset_max]\n" - "ld1rw { z16.s }, p2/Z, [x19]\n" - "fmin z24.s, p2/M, z24.s, z16.s\n" - "fmin z25.s, p2/M, z25.s, z16.s\n" - "fmin z26.s, p2/M, z26.s, z16.s\n" - "fmin z27.s, p2/M, z27.s, z16.s\n" - "fmin z28.s, p2/M, z28.s, z16.s\n" - "fmax z24.s, p2/M, z24.s, z17.s\n" - "fmax z25.s, p2/M, z25.s, z17.s\n" - "fmax z26.s, p2/M, z26.s, z17.s\n" - "fmax z27.s, p2/M, z27.s, z17.s\n" - "fmax z28.s, p2/M, z28.s, z17.s\n" - "fmin z29.s, p2/M, z29.s, z16.s\n" - "fmin z30.s, p2/M, z30.s, z16.s\n" - "fmin z31.s, p2/M, z31.s, z16.s\n" - "fmax z29.s, p2/M, z29.s, z17.s\n" - "fmax z30.s, p2/M, z30.s, z17.s\n" - "fmax z31.s, p2/M, z31.s, z17.s\n" - "56:" // Width 8: No activation - "st1w { z24.s }, p2, [%x[output_ptr]]\n" - "subs x23, x23, #0x8\n" - "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" - "sub %x[N], %x[N], x24, LSL #3\n" - "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" - "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" - 
"st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" - "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" - "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n" - "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n" - "addvl %x[output_ptr], %x[output_ptr], #8\n" - "bgt 1b\n" - "57:" // Exit - - : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) - : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) - : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); -} - -} // namespace arm_gemm - -#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp index 7b0282fa32..6677c23216 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp @@ -22,10 +22,11 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -43,7 +44,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST ); class cls_sve_hybrid_bf16fp32_dot_6x4VL { public: - typedef bfloat16 operand_type; + typedef bfloat16 lhs_operand_type; + typedef bfloat16 rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -69,7 +71,24 @@ public: return true; } - StdTransformsSVE transforms = {}; + StdTransformsSVE transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.83 }; + case CPUModel::A510: + return { 6.80 }; + case CPUModel::V1: + return { 31.55 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL; @@ -81,4 +100,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp index 34a657f64f..b794c21807 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -162,13 +162,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "cmp x26, #0x8\n" + "add x25, x25, #0x10\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" @@ -203,7 +202,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 
0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -242,9 +240,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -348,16 +345,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" @@ -408,9 +403,7 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" @@ -465,10 +458,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -602,21 +593,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -681,12 +669,9 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, 
p0/Z, [x23]\n" - "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" @@ -756,11 +741,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -923,26 +905,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -1021,19 +999,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "add x22, x22, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -1114,12 +1088,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" 
@@ -1311,32 +1281,27 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "add x21, x21, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" @@ -1428,22 +1393,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "add x21, x21, #0x10\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" @@ -1539,13 +1499,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1769,37 +1724,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add 
x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "add x20, x20, #0x10\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" @@ -1905,25 +1854,19 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" @@ -2034,14 +1977,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" "76:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -2153,4 +2090,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp new file mode 100644 index 0000000000..b8d237ff23 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../bfloat.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<bfloat16>, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_bf16fp32_mmla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_bf16fp32_mmla_6x4VL +{ +public: + typedef bfloat16 lhs_operand_type; + typedef bfloat16 rhs_operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<float>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, bfloat16>::value) { + switch (ci->get_cpu_model()) { + default: + return { 24.74 }; + case CPUModel::A510: + return { 6.74 }; + case CPUModel::V1: + return { 53.59 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_bf16fp32_mmla_6x4VL; + cls_sve_hybrid_bf16fp32_mmla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp new file mode 100644 index 0000000000..e69293e3f1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp @@ -0,0 +1,2045 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void sve_hybrid_bf16fp32_mmla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 66f\n" + "cmp %x[M], #0x4\n" + "bgt 53f\n" + "beq 40f\n" + "cmp %x[M], #0x2\n" + "bgt 27f\n" + "beq 14f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 3f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 
z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "addvl x9, x9, #4\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x26, #0x8\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "sub x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "cmp x26, #0x8\n" + "add x25, x25, #0x10\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "bgt 9b\n" + "10:" // Height 1: Multiply 
loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "subs x26, x26, #0x4\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "ble 11f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "11:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 6b\n" + "uzp1 z8.d, z8.d, z12.d\n" + "uzp1 z9.d, z9.d, z13.d\n" + "uzp1 z10.d, z10.d, z14.d\n" + "uzp1 z11.d, z11.d, z15.d\n" + "tbz %x[flags], #1, 12f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "12:" // Height 1: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "13:" // Height 1: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 16f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "addvl x9, x9, #4\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" 
+ "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 18f\n" + "17:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "18:" // Height 2: setup done + "mov x27, #0x0\n" + "19:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "21:" // Height 2: input setup done + "cmp x26, #0x8\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "cmp x26, #0x8\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x24, x24, #0x10\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 
0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "ble 24f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "24:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 19b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "tbz %x[flags], #1, 25f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "25:" // Height 2: No activation + "st1w { z7.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "26:" // 
Height 2: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "28:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 29f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 31f\n" + "30:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "31:" // Height 3: setup done + "mov x27, #0x0\n" + "32:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "34:" // Height 3: input setup done + "cmp x26, #0x8\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: 
Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "cmp x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "add x23, x23, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "subs x26, x26, #0x4\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla 
z17.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + "ble 37f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + "37:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 32b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z16.d, z16.d, z20.d\n" + "uzp1 z17.d, z17.d, z21.d\n" + "uzp1 z18.d, z18.d, z22.d\n" + "uzp1 z19.d, z19.d, z23.d\n" + "tbz %x[flags], #1, 38f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + 
"fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "38:" // Height 3: No activation + "st1w { z7.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "39:" // Height 3: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "41:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 42f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 44f\n" + "43:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "44:" // Height 
4: setup done + "mov x27, #0x0\n" + "45:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "47:" // Height 4: input setup done + "cmp x26, #0x8\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "cmp x26, #0x8\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "add x23, x23, #0x10\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "add x22, x22, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 
0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + "ble 50f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + "50:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 45b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + 
"uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "tbz %x[flags], #1, 51f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "51:" // Height 4: No activation + "st1w { z7.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "52:" // Height 4: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "54:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 55f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "mov z24.d, z8.d\n" + "mov z28.d, z12.d\n" + "mov z25.d, z9.d\n" + "mov z29.d, z13.d\n" + "mov z26.d, z10.d\n" + "mov z30.d, z14.d\n" + "mov z27.d, z11.d\n" + "mov z31.d, z15.d\n" + "b 
57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z25.s }, p4/Z, [x21]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "zip1 z24.d, z25.d, z28.d\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 57f\n" + "56:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "57:" // Height 5: setup done + "mov x27, #0x0\n" + "58:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 60f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "60:" // Height 5: input setup done + "cmp x26, #0x8\n" + "ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1rqh { z1.h }, 
p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "sub x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "cmp x26, #0x8\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1rqh { z5.h }, p0/Z, [x21]\n" + "add x25, x25, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "add x24, x24, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x23, x23, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "add x22, x22, #0x10\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + 
".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "subs x26, x26, #0x4\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "ld1rqh { z5.h }, p0/Z, [x21]\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + "ble 63f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla 
z19.s, z3.h, z7.h\n" + ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + "63:" // Height 5: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 58b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z24.d, z24.d, z28.d\n" + "uzp1 z25.d, z25.d, z29.d\n" + "uzp1 z26.d, z26.d, z30.d\n" + "uzp1 z27.d, z27.d, z31.d\n" + "tbz %x[flags], #1, 64f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "64:" // Height 5: No activation + "st1w { z7.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { 
z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "65:" // Height 5: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "67:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x11\n" + "incw x19\n" + "whilelt p3.s, x19, x11\n" + "incw x19\n" + "whilelt p2.s, x19, x11\n" + "incw x19\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 68f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "zip2 z12.d, z8.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "mov z24.d, z8.d\n" + "mov z28.d, z12.d\n" + "mov z25.d, z9.d\n" + "mov z29.d, z13.d\n" + "mov z26.d, z10.d\n" + "mov z30.d, z14.d\n" + "mov z27.d, z11.d\n" + "mov z31.d, z15.d\n" + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z25.s }, p4/Z, [x21]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z19.d, z24.d, z23.d\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "zip2 z23.d, z24.d, z23.d\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z24.d, z25.d, z28.d\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 
z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 70f\n" + "69:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "70:" // Height 6: setup done + "mov x27, #0x0\n" + "71:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 73f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "73:" // Height 6: input setup done + "cmp x26, #0x8\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "cmp x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x25, x25, #0x10\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1rqh { z5.h }, p0/Z, [x21]\n" + "add x24, x24, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1rqh { z6.h }, p0/Z, [x20]\n" + "add x23, x23, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + "add x20, x20, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + ".inst 
0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "ld1h { z7.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "subs x26, x26, #0x4\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z2.h }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z3.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "ld1rqh { z5.h }, p0/Z, [x21]\n" + ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" + "ld1rqh { z6.h }, p0/Z, [x20]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" + ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" + ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" + ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" + ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" + "ld1h { z7.h }, 
p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" + ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" + ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" + ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + "ble 76f\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" + ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" + ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" + ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" + ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" + ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" + ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" + ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #8\n" + ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" + ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" + ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + "76:" // Height 6: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 71b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z23.d, z24.d, z28.d\n" + "uzp2 z24.d, z24.d, z28.d\n" + "uzp1 z28.d, z25.d, z29.d\n" + "uzp2 z25.d, z25.d, z29.d\n" + "uzp1 z29.d, z26.d, z30.d\n" + "uzp2 z26.d, z26.d, z30.d\n" + "uzp1 z30.d, z27.d, z31.d\n" + "uzp2 z27.d, z27.d, z31.d\n" + "tbz %x[flags], #1, 77f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, 
p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmin z28.s, p5/M, z28.s, z0.s\n" + "fmin z29.s, p5/M, z29.s, z0.s\n" + "fmin z30.s, p5/M, z30.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z28.s, p5/M, z28.s, z1.s\n" + "fmax z29.s, p5/M, z29.s, z1.s\n" + "fmax z30.s, p5/M, z30.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "77:" // Height 6: No activation + "st1w { z7.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z23.s }, p4, [x21]\n" + "st1w { z28.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z29.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z30.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "78:" // Height 6: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 67b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 80f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 79f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "79:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "80:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), 
[offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp index f98ccdc7d3..6db9c0cdf3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp @@ -25,6 +25,7 @@ #ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -38,11 +39,13 @@ namespace arm_gemm { // Actual kernel implementations void sve_hybrid_fp16_mla_6x4VL( ARGLIST ); +void sve_hybrid_fp16_mla_6x4VL_a64fx( ARGLIST ); class cls_sve_hybrid_fp16_mla_6x4VL { public: - typedef __fp16 operand_type; + typedef __fp16 lhs_operand_type; + typedef __fp16 rhs_operand_type; typedef __fp16 result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,16 +71,41 @@ public: return true; } - StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; + StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, __fp16>::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.90 }; + case CPUModel::A510: + return { 12.44 }; + case CPUModel::V1: + return { 31.51 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_fp16_mla_6x4VL; - cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *) + cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_hybrid_fp16_mla_6x4VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE
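The hunk above shows both halves of the improved selection scheme: a templated get_performance_parameters() that reports a per-CPU-model throughput estimate, and a constructor that swaps the kernel pointer to a model-specific implementation when one exists. As a minimal standalone sketch of that dispatch shape (the names CpuModel, PerfParams, MyKernelCls and the stub kernels below are illustrative stand-ins, not the actual arm_gemm types; only the three estimates come from the hunk above):

#include <cstdio>

// Stand-ins for the library's CPUModel / PerformanceParameters machinery.
enum class CpuModel { GENERIC, A510, V1, A64FX };
struct PerfParams { float kernel_macs_cycle; };

using KernFn = void (*)();
void kern_generic() { std::puts("generic SVE kernel"); }
void kern_a64fx()   { std::puts("A64FX-tuned kernel"); }

struct MyKernelCls {
    // Throughput estimate per CPU model; the heuristic can compare these
    // numbers across candidate kernel classes before instantiating one.
    static PerfParams get_performance_parameters(CpuModel m) {
        switch (m) {
            case CpuModel::A510: return { 12.44f };
            case CpuModel::V1:   return { 31.51f };
            default:             return { 15.90f };
        }
    }

    KernFn kernel = kern_generic; // default to the generic kernel

    explicit MyKernelCls(CpuModel m) {
        // Swap in a model-specific variant where one exists.
        if (m == CpuModel::A64FX) { kernel = kern_a64fx; }
    }
};

int main() {
    MyKernelCls k(CpuModel::A64FX);
    std::printf("fp16 estimate on V1: %.2f macs/cycle\n",
                MyKernelCls::get_performance_parameters(CpuModel::V1).kernel_macs_cycle);
    k.kernel(); // runs the A64FX variant
    return 0;
}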
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp new file mode 100644 index 0000000000..11f5ed2c0a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp @@ -0,0 +1,1366 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void sve_hybrid_fp16_mla_6x4VL_a64fx ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; // bit 0: accumulate, bit 1: apply min/max activation, bit 2: indirect output, bit 3: indirect input + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p4.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 3f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop + "ldr x19, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x25, x25, #0x2\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "addvl x9, x9, #4\n" + "bne 6b\n" + "tbz %x[flags], #1, 11f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "11:" // Height 1: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "12:" // Height 1: Writeback done + "dech x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 74f\n" + "13:" // Height 2 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "14:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 15f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "b 17f\n" + "15:" // Height 2: no bias + "tbz %x[flags], #0, 16f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate 
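+ // No bias and no accumulation requested: the accumulators z8-z15 are cleared before the multiply loop.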
+ "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "17:" // Height 2: setup done + "mov x27, #0x0\n" + "18:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "20:" // Height 2: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x2\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "subs x26, x26, #0x1\n" + "add x24, x24, #0x2\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "addvl x9, x9, #4\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "bne 18b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "tbz %x[flags], #1, 23f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmin z12.h, p4/M, z12.h, z1.h\n" + "fmin z13.h, p4/M, z13.h, z1.h\n" + "fmin z14.h, p4/M, z14.h, z1.h\n" + "fmin z15.h, p4/M, z15.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "fmax z12.h, p4/M, z12.h, z0.h\n" + "fmax z13.h, p4/M, z13.h, z0.h\n" + "fmax z14.h, p4/M, z14.h, z0.h\n" + "fmax z15.h, p4/M, z15.h, z0.h\n" + "23:" // Height 2: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p3, [x24]\n" + "st1h { z13.h }, p2, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p1, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p0, [x24, #3, MUL VL]\n" + "24:" // Height 2: Writeback done + "dech x10, ALL, 
MUL #4\n" + "cmp x10, XZR\n" + "bgt 14b\n" + "b 74f\n" + "25:" // Height 3 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "26:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 27f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 29f\n" + "27:" // Height 3: no bias + "tbz %x[flags], #0, 28f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "29:" // Height 3: setup done + "mov x27, #0x0\n" + "30:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "32:" // Height 3: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x25, x25, #0x2\n" + "subs x26, x26, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x2\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x23, x23, #0x2\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + 
"fmla z19.h, p4/M, z7.h, z2.h\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x27, x27, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "tbz %x[flags], #1, 35f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmin z12.h, p4/M, z12.h, z1.h\n" + "fmin z13.h, p4/M, z13.h, z1.h\n" + "fmin z14.h, p4/M, z14.h, z1.h\n" + "fmin z15.h, p4/M, z15.h, z1.h\n" + "fmin z16.h, p4/M, z16.h, z1.h\n" + "fmin z17.h, p4/M, z17.h, z1.h\n" + "fmin z18.h, p4/M, z18.h, z1.h\n" + "fmin z19.h, p4/M, z19.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "fmax z12.h, p4/M, z12.h, z0.h\n" + "fmax z13.h, p4/M, z13.h, z0.h\n" + "fmax z14.h, p4/M, z14.h, z0.h\n" + "fmax z15.h, p4/M, z15.h, z0.h\n" + "fmax z16.h, p4/M, z16.h, z0.h\n" + "fmax z17.h, p4/M, z17.h, z0.h\n" + "fmax z18.h, p4/M, z18.h, z0.h\n" + "fmax z19.h, p4/M, z19.h, z0.h\n" + "35:" // Height 3: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p3, [x24]\n" + "st1h { z13.h }, p2, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p1, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p0, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p3, [x23]\n" + "st1h { z17.h }, p2, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p1, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p0, [x23, #3, MUL VL]\n" + "36:" // Height 3: Writeback done + "dech x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 26b\n" + "b 74f\n" + "37:" // Height 4 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "38:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 39f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 41f\n" + 
"39:" // Height 4: no bias + "tbz %x[flags], #0, 40f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x22]\n" + "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "41:" // Height 4: setup done + "mov x27, #0x0\n" + "42:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "44:" // Height 4: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x25, x25, #0x2\n" + "subs x26, x26, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, z3.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x2\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "add x23, x23, #0x2\n" + "add x22, x22, #0x2\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Main loop skip + "ldr w19, 
[%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x27, x27, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, z3.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "bne 42b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "tbz %x[flags], #1, 47f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmin z12.h, p4/M, z12.h, z1.h\n" + "fmin z13.h, p4/M, z13.h, z1.h\n" + "fmin z14.h, p4/M, z14.h, z1.h\n" + "fmin z15.h, p4/M, z15.h, z1.h\n" + "fmin z16.h, p4/M, z16.h, z1.h\n" + "fmin z17.h, p4/M, z17.h, z1.h\n" + "fmin z18.h, p4/M, z18.h, z1.h\n" + "fmin z19.h, p4/M, z19.h, z1.h\n" + "fmin z20.h, p4/M, z20.h, z1.h\n" + "fmin z21.h, p4/M, z21.h, z1.h\n" + "fmin z22.h, p4/M, z22.h, z1.h\n" + "fmin z23.h, p4/M, z23.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "fmax z12.h, p4/M, z12.h, z0.h\n" + "fmax z13.h, p4/M, z13.h, z0.h\n" + "fmax z14.h, p4/M, z14.h, z0.h\n" + "fmax z15.h, p4/M, z15.h, z0.h\n" + "fmax z16.h, p4/M, z16.h, z0.h\n" + "fmax z17.h, p4/M, z17.h, z0.h\n" + "fmax z18.h, p4/M, z18.h, z0.h\n" + "fmax z19.h, p4/M, z19.h, z0.h\n" + "fmax z20.h, p4/M, z20.h, z0.h\n" + "fmax z21.h, p4/M, z21.h, z0.h\n" + "fmax z22.h, p4/M, z22.h, z0.h\n" + "fmax z23.h, p4/M, z23.h, z0.h\n" + "47:" // Height 4: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p3, [x24]\n" + "st1h { z13.h }, p2, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p1, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p0, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p3, [x23]\n" + "st1h { z17.h }, p2, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p1, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p0, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p3, [x22]\n" + "st1h { z21.h }, p2, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p1, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p0, [x22, #3, MUL VL]\n" + "48:" // Height 4: Writeback done + "dech x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 38b\n" + "b 74f\n" + "49:" // Height 5 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "50:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 51f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + 
"ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 53f\n" + "51:" // Height 5: no bias + "tbz %x[flags], #0, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x22]\n" + "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x21]\n" + "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "53:" // Height 5: setup done + "mov x27, #0x0\n" + "54:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "56:" // Height 5: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "ld1rh { z4.h }, p4/Z, [x21]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x25, x25, #0x2\n" + "subs x26, x26, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, 
z3.h\n" + "add x24, x24, #0x2\n" + "add x23, x23, #0x2\n" + "fmla z24.h, p4/M, z6.h, z4.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x2\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "add x21, x21, #0x2\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "fmla z25.h, p4/M, z7.h, z4.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z26.h, p4/M, z6.h, z4.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z27.h, p4/M, z7.h, z4.h\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "ld1rh { z4.h }, p4/Z, [x21]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x27, x27, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, z3.h\n" + "cmp x27, x19\n" + "fmla z24.h, p4/M, z6.h, z4.h\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "fmla z25.h, p4/M, z7.h, z4.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z26.h, p4/M, z6.h, z4.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z27.h, p4/M, z7.h, z4.h\n" + "bne 54b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "tbz %x[flags], #1, 59f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmin z12.h, p4/M, z12.h, z1.h\n" + "fmin z13.h, p4/M, z13.h, z1.h\n" + "fmin z14.h, p4/M, z14.h, z1.h\n" + "fmin z15.h, p4/M, z15.h, z1.h\n" + "fmin z16.h, p4/M, z16.h, z1.h\n" + "fmin z17.h, p4/M, z17.h, z1.h\n" + "fmin z18.h, p4/M, z18.h, z1.h\n" + "fmin z19.h, p4/M, z19.h, z1.h\n" + "fmin z20.h, p4/M, z20.h, z1.h\n" + "fmin z21.h, p4/M, z21.h, z1.h\n" + "fmin z22.h, p4/M, z22.h, z1.h\n" + "fmin z23.h, p4/M, z23.h, z1.h\n" + "fmin z24.h, p4/M, z24.h, z1.h\n" + "fmin z25.h, p4/M, z25.h, z1.h\n" + "fmin z26.h, p4/M, z26.h, z1.h\n" + "fmin z27.h, p4/M, z27.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "fmax z12.h, p4/M, z12.h, z0.h\n" + "fmax z13.h, p4/M, z13.h, z0.h\n" + "fmax z14.h, p4/M, z14.h, z0.h\n" + "fmax z15.h, p4/M, z15.h, z0.h\n" + "fmax z16.h, p4/M, z16.h, z0.h\n" + "fmax z17.h, p4/M, z17.h, z0.h\n" + "fmax z18.h, p4/M, z18.h, z0.h\n" + "fmax z19.h, p4/M, z19.h, z0.h\n" + "fmax z20.h, p4/M, z20.h, 
z0.h\n" + "fmax z21.h, p4/M, z21.h, z0.h\n" + "fmax z22.h, p4/M, z22.h, z0.h\n" + "fmax z23.h, p4/M, z23.h, z0.h\n" + "fmax z24.h, p4/M, z24.h, z0.h\n" + "fmax z25.h, p4/M, z25.h, z0.h\n" + "fmax z26.h, p4/M, z26.h, z0.h\n" + "fmax z27.h, p4/M, z27.h, z0.h\n" + "59:" // Height 5: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p3, [x24]\n" + "st1h { z13.h }, p2, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p1, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p0, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p3, [x23]\n" + "st1h { z17.h }, p2, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p1, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p0, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p3, [x22]\n" + "st1h { z21.h }, p2, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p1, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p0, [x22, #3, MUL VL]\n" + "st1h { z24.h }, p3, [x21]\n" + "st1h { z25.h }, p2, [x21, #1, MUL VL]\n" + "st1h { z26.h }, p1, [x21, #2, MUL VL]\n" + "st1h { z27.h }, p0, [x21, #3, MUL VL]\n" + "60:" // Height 5: Writeback done + "dech x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 50b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0xc\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "62:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p3.h, x19, x10\n" + "inch x19\n" + "whilelt p2.h, x19, x10\n" + "inch x19\n" + "whilelt p1.h, x19, x10\n" + "inch x19\n" + "whilelt p0.h, x19, x10\n" + "cbz x11, 63f\n" + "ld1h { z8.h }, p4/Z, [x11]\n" + "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n" + "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 65f\n" + "63:" // Height 6: no bias + "tbz %x[flags], #0, 64f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x28]\n" + "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n" + "add x20, x21, x19, LSL #1\n" + "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x22]\n" + "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x21]\n" + "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x21, #3, 
MUL VL]\n" + "ld1h { z28.h }, p3/Z, [x20]\n" + "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "65:" // Height 6: setup done + "mov x27, #0x0\n" + "66:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "68:" // Height 6: input setup done + "subs x26, x26, #0x1\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "ld1rh { z4.h }, p4/Z, [x21]\n" + "ld1rh { z5.h }, p4/Z, [x20]\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x25, x25, #0x2\n" + "subs x26, x26, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, z3.h\n" + "add x24, x24, #0x2\n" + "add x23, x23, #0x2\n" + "fmla z24.h, p4/M, z6.h, z4.h\n" + "fmla z28.h, p4/M, z6.h, z5.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x2\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "add x21, x21, #0x2\n" + "add x20, x20, #0x2\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "fmla z25.h, p4/M, z7.h, z4.h\n" + "fmla z29.h, p4/M, z7.h, z5.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z26.h, p4/M, z6.h, z4.h\n" + "fmla z30.h, p4/M, z6.h, z5.h\n" + "ld1h { z6.h }, p4/Z, [x9]\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "ld1rh { z0.h }, p4/Z, [x25]\n" + "ld1rh { z1.h }, p4/Z, [x24]\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "ld1rh { z2.h }, p4/Z, [x23]\n" + "ld1rh { z3.h }, p4/Z, [x22]\n" + "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z31.h, p4/M, z7.h, z5.h\n" + "ld1rh { z4.h }, p4/Z, [x21]\n" + "ld1rh { z5.h }, p4/Z, [x20]\n" + "ld1h { 
z7.h }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.h, p4/M, z6.h, z0.h\n" + "fmla z12.h, p4/M, z6.h, z1.h\n" + "add x27, x27, #0x1\n" + "fmla z16.h, p4/M, z6.h, z2.h\n" + "fmla z20.h, p4/M, z6.h, z3.h\n" + "cmp x27, x19\n" + "fmla z24.h, p4/M, z6.h, z4.h\n" + "fmla z28.h, p4/M, z6.h, z5.h\n" + "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z9.h, p4/M, z7.h, z0.h\n" + "fmla z13.h, p4/M, z7.h, z1.h\n" + "fmla z17.h, p4/M, z7.h, z2.h\n" + "fmla z21.h, p4/M, z7.h, z3.h\n" + "fmla z25.h, p4/M, z7.h, z4.h\n" + "fmla z29.h, p4/M, z7.h, z5.h\n" + "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.h, p4/M, z6.h, z0.h\n" + "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z18.h, p4/M, z6.h, z2.h\n" + "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z26.h, p4/M, z6.h, z4.h\n" + "fmla z30.h, p4/M, z6.h, z5.h\n" + "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z31.h, p4/M, z7.h, z5.h\n" + "bne 66b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "tbz %x[flags], #1, 71f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z1.h }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z0.h }, p4/Z, [x19]\n" + "fmin z8.h, p4/M, z8.h, z1.h\n" + "fmin z9.h, p4/M, z9.h, z1.h\n" + "fmin z10.h, p4/M, z10.h, z1.h\n" + "fmin z11.h, p4/M, z11.h, z1.h\n" + "fmin z12.h, p4/M, z12.h, z1.h\n" + "fmin z13.h, p4/M, z13.h, z1.h\n" + "fmin z14.h, p4/M, z14.h, z1.h\n" + "fmin z15.h, p4/M, z15.h, z1.h\n" + "fmin z16.h, p4/M, z16.h, z1.h\n" + "fmin z17.h, p4/M, z17.h, z1.h\n" + "fmin z18.h, p4/M, z18.h, z1.h\n" + "fmin z19.h, p4/M, z19.h, z1.h\n" + "fmin z20.h, p4/M, z20.h, z1.h\n" + "fmin z21.h, p4/M, z21.h, z1.h\n" + "fmin z22.h, p4/M, z22.h, z1.h\n" + "fmin z23.h, p4/M, z23.h, z1.h\n" + "fmin z24.h, p4/M, z24.h, z1.h\n" + "fmin z25.h, p4/M, z25.h, z1.h\n" + "fmin z26.h, p4/M, z26.h, z1.h\n" + "fmin z27.h, p4/M, z27.h, z1.h\n" + "fmin z28.h, p4/M, z28.h, z1.h\n" + "fmin z29.h, p4/M, z29.h, z1.h\n" + "fmin z30.h, p4/M, z30.h, z1.h\n" + "fmin z31.h, p4/M, z31.h, z1.h\n" + "fmax z8.h, p4/M, z8.h, z0.h\n" + "fmax z9.h, p4/M, z9.h, z0.h\n" + "fmax z10.h, p4/M, z10.h, z0.h\n" + "fmax z11.h, p4/M, z11.h, z0.h\n" + "fmax z12.h, p4/M, z12.h, z0.h\n" + "fmax z13.h, p4/M, z13.h, z0.h\n" + "fmax z14.h, p4/M, z14.h, z0.h\n" + "fmax z15.h, p4/M, z15.h, z0.h\n" + "fmax z16.h, p4/M, z16.h, z0.h\n" + "fmax z17.h, p4/M, z17.h, z0.h\n" + "fmax z18.h, p4/M, z18.h, z0.h\n" + "fmax z19.h, p4/M, z19.h, z0.h\n" + "fmax z20.h, p4/M, z20.h, z0.h\n" + "fmax z21.h, p4/M, z21.h, z0.h\n" + "fmax z22.h, p4/M, z22.h, z0.h\n" + "fmax z23.h, p4/M, z23.h, z0.h\n" + "fmax z24.h, p4/M, z24.h, z0.h\n" + "fmax z25.h, p4/M, z25.h, z0.h\n" + "fmax z26.h, p4/M, z26.h, z0.h\n" + "fmax z27.h, p4/M, z27.h, z0.h\n" + "fmax z28.h, p4/M, z28.h, z0.h\n" + "fmax z29.h, p4/M, z29.h, z0.h\n" + "fmax z30.h, p4/M, z30.h, z0.h\n" + "fmax z31.h, p4/M, z31.h, z0.h\n" + "71:" // Height 6: No activation + "st1h { z8.h }, p3, [x28]\n" + "st1h { z9.h }, p2, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p1, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p3, [x24]\n" + "st1h { z13.h }, 
p2, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p1, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p0, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p3, [x23]\n" + "st1h { z17.h }, p2, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p1, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p0, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p3, [x22]\n" + "st1h { z21.h }, p2, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p1, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p0, [x22, #3, MUL VL]\n" + "st1h { z24.h }, p3, [x21]\n" + "st1h { z25.h }, p2, [x21, #1, MUL VL]\n" + "st1h { z26.h }, p1, [x21, #2, MUL VL]\n" + "st1h { z27.h }, p0, [x21, #3, MUL VL]\n" + "st1h { z28.h }, p3, [x20]\n" + "st1h { z29.h }, p2, [x20, #1, MUL VL]\n" + "st1h { z30.h }, p1, [x20, #2, MUL VL]\n" + "st1h { z31.h }, p0, [x20, #3, MUL VL]\n" + "72:" // Height 6: Writeback done + "dech x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 62b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp index c151179a1f..09d5d8d96d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp @@ -161,13 +161,12 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "cmp x26, #0x8\n" + "add x25, x25, #0x10\n" "fmla z10.h, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z11.h, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" @@ -235,7 +234,6 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -318,9 +316,8 @@ void 
sve_hybrid_fp16_mla_6x4VL ( "fmla z10.h, z6.h, z0.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -424,16 +421,14 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" "fmla z13.h, z7.h, z1.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" @@ -533,9 +528,7 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z13.h, z7.h, z1.h[0]\n" @@ -650,10 +643,8 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z11.h, z7.h, z0.h[7]\n" "fmla z15.h, z7.h, z1.h[7]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -787,21 +778,18 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z17.h, z7.h, z2.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" @@ -931,12 +919,9 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "add x23, x23, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "fmla z16.h, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" @@ -1082,11 +1067,8 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[7]\n" "fmla z19.h, z7.h, z2.h[7]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1249,26 +1231,22 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, 
x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" + "add x22, x22, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" @@ -1428,19 +1406,15 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "add x22, x22, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z16.h, z6.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" "fmla z21.h, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -1613,12 +1587,8 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[7]\n" "fmla z23.h, z7.h, z3.h[7]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1810,32 +1780,27 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" "add x21, x21, #0x10\n" "fmla z20.h, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" "fmla z24.h, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.h, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" "fmla z22.h, z6.h, z3.h[0]\n" @@ -2024,22 +1989,17 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, 
x24, #0x10\n" - "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.h, z6.h, z2.h[0]\n" + "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" - "add x21, x21, #0x10\n" - "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z16.h, z6.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" "fmla z24.h, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" "fmla z21.h, z7.h, z3.h[0]\n" "fmla z25.h, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" @@ -2243,13 +2203,8 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[7]\n" "fmla z27.h, z7.h, z4.h[7]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -2473,37 +2428,31 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z20.h, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" "add x20, x20, #0x10\n" "fmla z24.h, z6.h, z4.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" "fmla z28.h, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.h, z7.h, z4.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z29.h, z7.h, z5.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" "fmla z22.h, z6.h, z3.h[0]\n" @@ -2722,25 +2671,19 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.h, z6.h, z2.h[0]\n" + "fmla z12.h, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" - "add x20, x20, #0x10\n" - "fmla z17.h, z7.h, z2.h[0]\n" "fmla z24.h, z6.h, z4.h[0]\n" "fmla z28.h, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" "fmla z21.h, z7.h, z3.h[0]\n" "fmla z25.h, z7.h, z4.h[0]\n" "fmla z29.h, z7.h, z5.h[0]\n" @@ -2975,14 +2918,8 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z27.h, z7.h, z4.h[7]\n" "fmla z31.h, z7.h, z5.h[7]\n" "76:" // Height 6: 
Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp index 4c0a3a11e0..1c140e0c02 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp @@ -22,9 +22,10 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE + +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -38,11 +39,13 @@ namespace arm_gemm { // Actual kernel implementations void sve_hybrid_fp32_mla_6x4VL( ARGLIST ); +void sve_hybrid_fp32_mla_6x4VL_a64fx( ARGLIST ); class cls_sve_hybrid_fp32_mla_6x4VL { public: - typedef float operand_type; + typedef float lhs_operand_type; + typedef float rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,16 +71,37 @@ public: return true; } - StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; + StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 1> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + default: + return { 6.667 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_6x4VL; - cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) + cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_hybrid_fp32_mla_6x4VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp new file mode 100644 index 0000000000..30b6a54277 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp @@ -0,0 +1,1366 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_6x4VL_a64fx ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p4.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 3f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" +
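+ // Indirect input: only the first string segment (x27 == 0) adds the initial column offset (in floats) to the row pointer.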
"cbnz x27, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x25, x25, #0x4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "addvl x9, x9, #4\n" + "bne 6b\n" + "tbz %x[flags], #1, 11f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "11:" // Height 1: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "12:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 74f\n" + "13:" // Height 2 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "14:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 15f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "b 17f\n" + "15:" // Height 2: no bias + "tbz %x[flags], #0, 16f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "17:" // Height 2: setup done + "mov x27, #0x0\n" + "18:" // Height 2: String loop + "ldr x19, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "20:" // Height 2: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x4\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "subs x26, x26, #0x1\n" + "add x24, x24, #0x4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "addvl x9, x9, #4\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "bne 18b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "tbz %x[flags], #1, 23f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmin z12.s, p4/M, z12.s, z1.s\n" + "fmin z13.s, p4/M, z13.s, z1.s\n" + "fmin z14.s, p4/M, z14.s, z1.s\n" + "fmin z15.s, p4/M, z15.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "fmax z12.s, p4/M, z12.s, z0.s\n" + "fmax z13.s, p4/M, z13.s, z0.s\n" + "fmax z14.s, p4/M, z14.s, z0.s\n" + "fmax z15.s, p4/M, z15.s, z0.s\n" + "23:" // Height 2: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x24]\n" + "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "24:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 14b\n" + "b 74f\n" + "25:" // Height 3 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "26:" // Height 3: Column loop + "mov x19, 
#0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 27f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 29f\n" + "27:" // Height 3: no bias + "tbz %x[flags], #0, 28f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "29:" // Height 3: setup done + "mov x27, #0x0\n" + "30:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "32:" // Height 3: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x23, x23, #0x4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + 
"fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x27, x27, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "tbz %x[flags], #1, 35f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmin z12.s, p4/M, z12.s, z1.s\n" + "fmin z13.s, p4/M, z13.s, z1.s\n" + "fmin z14.s, p4/M, z14.s, z1.s\n" + "fmin z15.s, p4/M, z15.s, z1.s\n" + "fmin z16.s, p4/M, z16.s, z1.s\n" + "fmin z17.s, p4/M, z17.s, z1.s\n" + "fmin z18.s, p4/M, z18.s, z1.s\n" + "fmin z19.s, p4/M, z19.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "fmax z12.s, p4/M, z12.s, z0.s\n" + "fmax z13.s, p4/M, z13.s, z0.s\n" + "fmax z14.s, p4/M, z14.s, z0.s\n" + "fmax z15.s, p4/M, z15.s, z0.s\n" + "fmax z16.s, p4/M, z16.s, z0.s\n" + "fmax z17.s, p4/M, z17.s, z0.s\n" + "fmax z18.s, p4/M, z18.s, z0.s\n" + "fmax z19.s, p4/M, z19.s, z0.s\n" + "35:" // Height 3: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x24]\n" + "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x23]\n" + "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" + "36:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 26b\n" + "b 74f\n" + "37:" // Height 4 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "38:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 39f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 41f\n" + "39:" // Height 4: no bias + "tbz %x[flags], #0, 40f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, 
#1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x22]\n" + "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "41:" // Height 4: setup done + "mov x27, #0x0\n" + "42:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "44:" // Height 4: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "add x23, x23, #0x4\n" + "add x22, x22, #0x4\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x27, x27, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + 
"fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "bne 42b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "tbz %x[flags], #1, 47f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmin z12.s, p4/M, z12.s, z1.s\n" + "fmin z13.s, p4/M, z13.s, z1.s\n" + "fmin z14.s, p4/M, z14.s, z1.s\n" + "fmin z15.s, p4/M, z15.s, z1.s\n" + "fmin z16.s, p4/M, z16.s, z1.s\n" + "fmin z17.s, p4/M, z17.s, z1.s\n" + "fmin z18.s, p4/M, z18.s, z1.s\n" + "fmin z19.s, p4/M, z19.s, z1.s\n" + "fmin z20.s, p4/M, z20.s, z1.s\n" + "fmin z21.s, p4/M, z21.s, z1.s\n" + "fmin z22.s, p4/M, z22.s, z1.s\n" + "fmin z23.s, p4/M, z23.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "fmax z12.s, p4/M, z12.s, z0.s\n" + "fmax z13.s, p4/M, z13.s, z0.s\n" + "fmax z14.s, p4/M, z14.s, z0.s\n" + "fmax z15.s, p4/M, z15.s, z0.s\n" + "fmax z16.s, p4/M, z16.s, z0.s\n" + "fmax z17.s, p4/M, z17.s, z0.s\n" + "fmax z18.s, p4/M, z18.s, z0.s\n" + "fmax z19.s, p4/M, z19.s, z0.s\n" + "fmax z20.s, p4/M, z20.s, z0.s\n" + "fmax z21.s, p4/M, z21.s, z0.s\n" + "fmax z22.s, p4/M, z22.s, z0.s\n" + "fmax z23.s, p4/M, z23.s, z0.s\n" + "47:" // Height 4: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x24]\n" + "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x23]\n" + "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x22]\n" + "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" + "48:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 38b\n" + "b 74f\n" + "49:" // Height 5 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "50:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 51f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + 
"mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 53f\n" + "51:" // Height 5: no bias + "tbz %x[flags], #0, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x22]\n" + "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x21]\n" + "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "53:" // Height 5: setup done + "mov x27, #0x0\n" + "54:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "56:" // Height 5: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z24.s, p4/M, z6.s, z4.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "add 
x21, x21, #0x4\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "fmla z25.s, p4/M, z7.s, z4.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z26.s, p4/M, z6.s, z4.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z27.s, p4/M, z7.s, z4.s\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x27, x27, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "cmp x27, x19\n" + "fmla z24.s, p4/M, z6.s, z4.s\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "fmla z25.s, p4/M, z7.s, z4.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z26.s, p4/M, z6.s, z4.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z27.s, p4/M, z7.s, z4.s\n" + "bne 54b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "tbz %x[flags], #1, 59f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmin z12.s, p4/M, z12.s, z1.s\n" + "fmin z13.s, p4/M, z13.s, z1.s\n" + "fmin z14.s, p4/M, z14.s, z1.s\n" + "fmin z15.s, p4/M, z15.s, z1.s\n" + "fmin z16.s, p4/M, z16.s, z1.s\n" + "fmin z17.s, p4/M, z17.s, z1.s\n" + "fmin z18.s, p4/M, z18.s, z1.s\n" + "fmin z19.s, p4/M, z19.s, z1.s\n" + "fmin z20.s, p4/M, z20.s, z1.s\n" + "fmin z21.s, p4/M, z21.s, z1.s\n" + "fmin z22.s, p4/M, z22.s, z1.s\n" + "fmin z23.s, p4/M, z23.s, z1.s\n" + "fmin z24.s, p4/M, z24.s, z1.s\n" + "fmin z25.s, p4/M, z25.s, z1.s\n" + "fmin z26.s, p4/M, z26.s, z1.s\n" + "fmin z27.s, p4/M, z27.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "fmax z12.s, p4/M, z12.s, z0.s\n" + "fmax z13.s, p4/M, z13.s, z0.s\n" + "fmax z14.s, p4/M, z14.s, z0.s\n" + "fmax z15.s, p4/M, z15.s, z0.s\n" + "fmax z16.s, p4/M, z16.s, z0.s\n" + "fmax z17.s, p4/M, z17.s, z0.s\n" + "fmax z18.s, p4/M, z18.s, z0.s\n" + "fmax z19.s, p4/M, z19.s, z0.s\n" + "fmax z20.s, p4/M, z20.s, z0.s\n" + "fmax z21.s, p4/M, z21.s, z0.s\n" + "fmax z22.s, p4/M, z22.s, z0.s\n" + "fmax z23.s, p4/M, z23.s, z0.s\n" + "fmax z24.s, p4/M, z24.s, z0.s\n" + "fmax z25.s, p4/M, z25.s, z0.s\n" + "fmax z26.s, p4/M, z26.s, z0.s\n" + "fmax z27.s, p4/M, z27.s, z0.s\n" + "59:" // 
Height 5: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x24]\n" + "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x23]\n" + "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x22]\n" + "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x21]\n" + "st1w { z25.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x21, #3, MUL VL]\n" + "60:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 50b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "62:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "cbz x11, 63f\n" + "ld1w { z8.s }, p4/Z, [x11]\n" + "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n" + "mov z12.d, z8.d\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "addvl x11, x11, #4\n" + "mov z16.d, z8.d\n" + "mov z17.d, z9.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z20.d, z8.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 65f\n" + "63:" // Height 6: no bias + "tbz %x[flags], #0, 64f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x22]\n" + "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x21]\n" + "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p3/Z, [x20]\n" + "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, 
#0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "65:" // Height 6: setup done + "mov x27, #0x0\n" + "66:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "68:" // Height 6: input setup done + "subs x26, x26, #0x1\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x1\n" + "fmla z16.s, p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z24.s, p4/M, z6.s, z4.s\n" + "fmla z28.s, p4/M, z6.s, z5.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "add x21, x21, #0x4\n" + "add x20, x20, #0x4\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "fmla z25.s, p4/M, z7.s, z4.s\n" + "fmla z29.s, p4/M, z7.s, z5.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z26.s, p4/M, z6.s, z4.s\n" + "fmla z30.s, p4/M, z6.s, z5.s\n" + "ld1w { z6.s }, p4/Z, [x9]\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z31.s, p4/M, z7.s, z5.s\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "fmla z8.s, p4/M, z6.s, z0.s\n" + "fmla z12.s, p4/M, z6.s, z1.s\n" + "add x27, x27, #0x1\n" + "fmla z16.s, 
p4/M, z6.s, z2.s\n" + "fmla z20.s, p4/M, z6.s, z3.s\n" + "cmp x27, x19\n" + "fmla z24.s, p4/M, z6.s, z4.s\n" + "fmla z28.s, p4/M, z6.s, z5.s\n" + "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n" + "fmla z9.s, p4/M, z7.s, z0.s\n" + "fmla z13.s, p4/M, z7.s, z1.s\n" + "fmla z17.s, p4/M, z7.s, z2.s\n" + "fmla z21.s, p4/M, z7.s, z3.s\n" + "fmla z25.s, p4/M, z7.s, z4.s\n" + "fmla z29.s, p4/M, z7.s, z5.s\n" + "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "fmla z10.s, p4/M, z6.s, z0.s\n" + "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z18.s, p4/M, z6.s, z2.s\n" + "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z26.s, p4/M, z6.s, z4.s\n" + "fmla z30.s, p4/M, z6.s, z5.s\n" + "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z31.s, p4/M, z7.s, z5.s\n" + "bne 66b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "tbz %x[flags], #1, 71f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p4/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p4/Z, [x19]\n" + "fmin z8.s, p4/M, z8.s, z1.s\n" + "fmin z9.s, p4/M, z9.s, z1.s\n" + "fmin z10.s, p4/M, z10.s, z1.s\n" + "fmin z11.s, p4/M, z11.s, z1.s\n" + "fmin z12.s, p4/M, z12.s, z1.s\n" + "fmin z13.s, p4/M, z13.s, z1.s\n" + "fmin z14.s, p4/M, z14.s, z1.s\n" + "fmin z15.s, p4/M, z15.s, z1.s\n" + "fmin z16.s, p4/M, z16.s, z1.s\n" + "fmin z17.s, p4/M, z17.s, z1.s\n" + "fmin z18.s, p4/M, z18.s, z1.s\n" + "fmin z19.s, p4/M, z19.s, z1.s\n" + "fmin z20.s, p4/M, z20.s, z1.s\n" + "fmin z21.s, p4/M, z21.s, z1.s\n" + "fmin z22.s, p4/M, z22.s, z1.s\n" + "fmin z23.s, p4/M, z23.s, z1.s\n" + "fmin z24.s, p4/M, z24.s, z1.s\n" + "fmin z25.s, p4/M, z25.s, z1.s\n" + "fmin z26.s, p4/M, z26.s, z1.s\n" + "fmin z27.s, p4/M, z27.s, z1.s\n" + "fmin z28.s, p4/M, z28.s, z1.s\n" + "fmin z29.s, p4/M, z29.s, z1.s\n" + "fmin z30.s, p4/M, z30.s, z1.s\n" + "fmin z31.s, p4/M, z31.s, z1.s\n" + "fmax z8.s, p4/M, z8.s, z0.s\n" + "fmax z9.s, p4/M, z9.s, z0.s\n" + "fmax z10.s, p4/M, z10.s, z0.s\n" + "fmax z11.s, p4/M, z11.s, z0.s\n" + "fmax z12.s, p4/M, z12.s, z0.s\n" + "fmax z13.s, p4/M, z13.s, z0.s\n" + "fmax z14.s, p4/M, z14.s, z0.s\n" + "fmax z15.s, p4/M, z15.s, z0.s\n" + "fmax z16.s, p4/M, z16.s, z0.s\n" + "fmax z17.s, p4/M, z17.s, z0.s\n" + "fmax z18.s, p4/M, z18.s, z0.s\n" + "fmax z19.s, p4/M, z19.s, z0.s\n" + "fmax z20.s, p4/M, z20.s, z0.s\n" + "fmax z21.s, p4/M, z21.s, z0.s\n" + "fmax z22.s, p4/M, z22.s, z0.s\n" + "fmax z23.s, p4/M, z23.s, z0.s\n" + "fmax z24.s, p4/M, z24.s, z0.s\n" + "fmax z25.s, p4/M, z25.s, z0.s\n" + "fmax z26.s, p4/M, z26.s, z0.s\n" + "fmax z27.s, p4/M, z27.s, z0.s\n" + "fmax z28.s, p4/M, z28.s, z0.s\n" + "fmax z29.s, p4/M, z29.s, z0.s\n" + "fmax z30.s, p4/M, z30.s, z0.s\n" + "fmax z31.s, p4/M, z31.s, z0.s\n" + "71:" // Height 6: No activation + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x24]\n" + "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x23]\n" + "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x23, 
#3, MUL VL]\n" + "st1w { z20.s }, p3, [x22]\n" + "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x21]\n" + "st1w { z25.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z28.s }, p3, [x20]\n" + "st1w { z29.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z30.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z31.s }, p0, [x20, #3, MUL VL]\n" + "72:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 62b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp index 25d65826b9..3baf7b9715 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp @@ -161,13 +161,12 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "cmp x26, #0x4\n" + "add x25, x25, #0x10\n" "fmla z10.s, z6.s, z0.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z11.s, z7.s, z0.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" @@ -202,7 +201,6 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -241,9 +239,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z10.s, z6.s, z0.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp 
x27, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -347,16 +344,14 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x4\n" "fmla z13.s, z7.s, z1.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" @@ -407,9 +402,7 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z13.s, z7.s, z1.s[0]\n" @@ -464,10 +457,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z11.s, z7.s, z0.s[3]\n" "fmla z15.s, z7.s, z1.s[3]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -601,21 +592,18 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x4\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z17.s, z7.s, z2.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" @@ -680,12 +668,9 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "add x23, x23, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "fmla z16.s, z6.s, z2.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" @@ -755,11 +740,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[3]\n" "fmla z19.s, z7.s, z2.s[3]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -922,26 +904,22 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.s, z6.s, 
z2.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x4\n" + "add x22, x22, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" @@ -1020,19 +998,15 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "add x22, x22, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z16.s, z6.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" "fmla z21.s, z7.s, z3.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -1113,12 +1087,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[3]\n" "fmla z23.s, z7.s, z3.s[3]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1310,32 +1280,27 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" "add x21, x21, #0x10\n" "fmla z20.s, z6.s, z3.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x4\n" "fmla z24.s, z6.s, z4.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.s, z7.s, z4.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" "fmla z22.s, z6.s, z3.s[0]\n" @@ -1427,22 +1392,17 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.s, z6.s, z2.s[0]\n" + "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" - "add x21, x21, #0x10\n" 
- "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z16.s, z6.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" "fmla z24.s, z6.s, z4.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" "fmla z21.s, z7.s, z3.s[0]\n" "fmla z25.s, z7.s, z4.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" @@ -1538,13 +1498,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[3]\n" "fmla z27.s, z7.s, z4.s[3]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1768,37 +1723,31 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x4\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z20.s, z6.s, z3.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" "add x20, x20, #0x10\n" "fmla z24.s, z6.s, z4.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x4\n" "fmla z28.s, z6.s, z5.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.s, z7.s, z4.s[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z29.s, z7.s, z5.s[0]\n" "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" "fmla z22.s, z6.s, z3.s[0]\n" @@ -1904,25 +1853,19 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z16.s, z6.s, z2.s[0]\n" + "fmla z12.s, z6.s, z1.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" - "add x20, x20, #0x10\n" - "fmla z17.s, z7.s, z2.s[0]\n" "fmla z24.s, z6.s, z4.s[0]\n" "fmla z28.s, z6.s, z5.s[0]\n" "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" "fmla z21.s, z7.s, z3.s[0]\n" "fmla z25.s, z7.s, z4.s[0]\n" "fmla z29.s, z7.s, z5.s[0]\n" @@ -2033,14 +1976,8 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z27.s, z7.s, z4.s[3]\n" "fmla z31.s, z7.s, z5.s[3]\n" "76:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], 
%[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp index 87f063d224..c0718b1e75 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp @@ -22,8 +22,8 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE + +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" #define ARGLIST \ @@ -38,11 +38,13 @@ namespace arm_gemm { // Actual kernel implementations void sve_hybrid_fp32_mla_8x1VL( ARGLIST ); +void sve_hybrid_fp32_mla_8x1VL_a64fx( ARGLIST ); class cls_sve_hybrid_fp32_mla_8x1VL { public: - typedef float operand_type; + typedef float lhs_operand_type; + typedef float rhs_operand_type; typedef float result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,16 +70,24 @@ public: return true; } - StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {}; + StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 8, 1, 1> transforms = {}; // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_8x1VL; - cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *) + cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_hybrid_fp32_mla_8x1VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp new file mode 100644 index 0000000000..0a37f8abfc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp @@ -0,0 +1,1143 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_8x1VL_a64fx ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p1.b\n" + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 85f\n" + "cmp %x[M], #0x6\n" + "bgt 73f\n" + "beq 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 3f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "addvl x13, x13, #1\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z24.b, #0x0\n" + "5:" // Height 1: setup done + "mov x9, #0x0\n" + "6:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "cbnz x9, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x27, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "addvl x11, x11, #1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9,
#0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "addvl x11, x11, #1\n" + "bne 6b\n" + "tbz %x[flags], #1, 11f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "11:" // Height 1: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "12:" // Height 1: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 2b\n" + "b 98f\n" + "13:" // Height 2 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "14:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 15f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "addvl x13, x13, #1\n" + "b 17f\n" + "15:" // Height 2: no bias + "tbz %x[flags], #0, 16f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "17:" // Height 2: setup done + "mov x9, #0x0\n" + "18:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x9, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "20:" // Height 2: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "addvl x11, x11, #1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "bne 18b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "tbz %x[flags], #1, 23f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "23:" // Height 2: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "24:" // Height 2: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 14b\n" + "b 98f\n" + "25:" // Height 3 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "26:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 27f\n" + "ld1w { 
z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "b 29f\n" + "27:" // Height 3: no bias + "tbz %x[flags], #0, 28f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "29:" // Height 3: setup done + "mov x9, #0x0\n" + "30:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "cbnz x9, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "32:" // Height 3: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "addvl x11, x11, #1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "tbz %x[flags], #1, 35f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "35:" // Height 3: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "36:" // Height 3: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 26b\n" + "b 98f\n" + "37:" // Height 4 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "38:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 39f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "mov z27.d, z24.d\n" + "b 41f\n" + "39:" // Height 4: no bias + "tbz %x[flags], #0, 40f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, 
x25, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "ld1w { z27.s }, p0/Z, [x24]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "41:" // Height 4: setup done + "mov x9, #0x0\n" + "42:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x9, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "44:" // Height 4: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "add x24, x24, #0x4\n" + "fmla z27.s, p1/M, z8.s, z3.s\n" + "addvl x11, x11, #1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z27.s, p1/M, z9.s, z3.s\n" + "bne 42b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "tbz %x[flags], #1, 47f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmin z27.s, p1/M, z27.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "fmax z27.s, p1/M, z27.s, z16.s\n" + "47:" // Height 4: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "st1w { z27.s }, p0, [x24]\n" + "48:" // Height 4: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 38b\n" + "b 98f\n" + "49:" // Height 5 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "50:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 51f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "mov z27.d, z24.d\n" + "mov z28.d, 
z24.d\n" + "b 53f\n" + "51:" // Height 5: no bias + "tbz %x[flags], #0, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "ld1w { z27.s }, p0/Z, [x24]\n" + "ld1w { z28.s }, p0/Z, [x23]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "53:" // Height 5: setup done + "mov x9, #0x0\n" + "54:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "cbnz x9, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "56:" // Height 5: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z27.s, p1/M, z8.s, z3.s\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "addvl x11, x11, #1\n" + "fmla z28.s, p1/M, z8.s, z4.s\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z27.s, p1/M, z9.s, z3.s\n" + "fmla z28.s, p1/M, z9.s, z4.s\n" + "bne 54b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "tbz %x[flags], #1, 59f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmin z27.s, p1/M, z27.s, z17.s\n" + "fmin z28.s, p1/M, z28.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "fmax z27.s, p1/M, z27.s, z16.s\n" + "fmax z28.s, p1/M, z28.s, z16.s\n" + "59:" // Height 5: No activation + 
"st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "st1w { z27.s }, p0, [x24]\n" + "st1w { z28.s }, p0, [x23]\n" + "60:" // Height 5: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 50b\n" + "b 98f\n" + "61:" // Height 6 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "62:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 63f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "b 65f\n" + "63:" // Height 6: no bias + "tbz %x[flags], #0, 64f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "ld1w { z27.s }, p0/Z, [x24]\n" + "ld1w { z28.s }, p0/Z, [x23]\n" + "ld1w { z29.s }, p0/Z, [x22]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "65:" // Height 6: setup done + "mov x9, #0x0\n" + "66:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "cbnz x9, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "68:" // Height 6: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z27.s, p1/M, z8.s, z3.s\n" + "fmla z28.s, p1/M, z8.s, z4.s\n" + "add x22, x22, #0x4\n" + "addvl x11, x11, #1\n" + "fmla z29.s, p1/M, z8.s, z5.s\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, 
#0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z27.s, p1/M, z9.s, z3.s\n" + "fmla z28.s, p1/M, z9.s, z4.s\n" + "fmla z29.s, p1/M, z9.s, z5.s\n" + "bne 66b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "tbz %x[flags], #1, 71f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmin z27.s, p1/M, z27.s, z17.s\n" + "fmin z28.s, p1/M, z28.s, z17.s\n" + "fmin z29.s, p1/M, z29.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "fmax z27.s, p1/M, z27.s, z16.s\n" + "fmax z28.s, p1/M, z28.s, z16.s\n" + "fmax z29.s, p1/M, z29.s, z16.s\n" + "71:" // Height 6: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "st1w { z27.s }, p0, [x24]\n" + "st1w { z28.s }, p0, [x23]\n" + "st1w { z29.s }, p0, [x22]\n" + "72:" // Height 6: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 62b\n" + "b 98f\n" + "73:" // Height 7 + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "74:" // Height 7: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz x13, 75f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "b 77f\n" + "75:" // Height 7: no bias + "tbz %x[flags], #0, 76f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "ld1w { z27.s }, p0/Z, [x24]\n" + "ld1w { z28.s }, p0/Z, [x23]\n" + "ld1w { z29.s }, p0/Z, [x22]\n" + "ld1w { z30.s }, p0/Z, [x21]\n" + "b 77f\n" + "76:" // Height 7: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "77:" // Height 7: setup done + "mov x9, #0x0\n" + "78:" // Height 7: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 79f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" + "cbnz x9, 80f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 80f\n" + "79:" // 
Height 7: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "80:" // Height 7: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "ld1rw { z6.s }, p1/Z, [x21]\n" + "ble 82f\n" + "81:" // Height 7: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z27.s, p1/M, z8.s, z3.s\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "add x22, x22, #0x4\n" + "add x21, x21, #0x4\n" + "fmla z28.s, p1/M, z8.s, z4.s\n" + "fmla z29.s, p1/M, z8.s, z5.s\n" + "addvl x11, x11, #1\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "fmla z30.s, p1/M, z8.s, z6.s\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "ld1rw { z6.s }, p1/Z, [x21]\n" + "bgt 81b\n" + "82:" // Height 7: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z27.s, p1/M, z9.s, z3.s\n" + "fmla z28.s, p1/M, z9.s, z4.s\n" + "fmla z29.s, p1/M, z9.s, z5.s\n" + "fmla z30.s, p1/M, z9.s, z6.s\n" + "bne 78b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmin z27.s, p1/M, z27.s, z17.s\n" + "fmin z28.s, p1/M, z28.s, z17.s\n" + "fmin z29.s, p1/M, z29.s, z17.s\n" + "fmin z30.s, p1/M, z30.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "fmax z27.s, p1/M, z27.s, z16.s\n" + "fmax z28.s, p1/M, z28.s, z16.s\n" + "fmax z29.s, p1/M, z29.s, z16.s\n" + "fmax z30.s, p1/M, z30.s, z16.s\n" + "83:" // Height 7: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "st1w { z27.s }, p0, [x24]\n" + "st1w { z28.s }, p0, [x23]\n" + "st1w { z29.s }, p0, [x22]\n" + "st1w { z30.s }, p0, [x21]\n" + "84:" // Height 7: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 74b\n" + "b 98f\n" + "85:" // Height 8 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x20\n" + "mov x13, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "86:" // Height 8: Column loop + "mov x19, #0x0\n" + "whilelt p0.s, x19, x12\n" + "cbz 
x13, 87f\n" + "ld1w { z24.s }, p1/Z, [x13]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "addvl x13, x13, #1\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "b 89f\n" + "87:" // Height 8: no bias + "tbz %x[flags], #0, 88f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z24.s }, p0/Z, [x10]\n" + "ld1w { z25.s }, p0/Z, [x26]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z26.s }, p0/Z, [x25]\n" + "ld1w { z27.s }, p0/Z, [x24]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z28.s }, p0/Z, [x23]\n" + "ld1w { z29.s }, p0/Z, [x22]\n" + "ld1w { z30.s }, p0/Z, [x21]\n" + "ld1w { z31.s }, p0/Z, [x20]\n" + "b 89f\n" + "88:" // Height 8: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "89:" // Height 8: setup done + "mov x9, #0x0\n" + "90:" // Height 8: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w28, [x19, x9, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 91f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x9, 92f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 92f\n" + "91:" // Height 8: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "92:" // Height 8: input setup done + "subs x28, x28, #0x1\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "ld1rw { z6.s }, p1/Z, [x21]\n" + "ld1rw { z7.s }, p1/Z, [x20]\n" + "ble 94f\n" + "93:" // Height 8: Multiply loop: Main loop + "ld1w { z8.s }, p1/Z, [x11]\n" + "add x27, x27, #0x4\n" + "subs x28, x28, #0x1\n" + "fmla z24.s, p1/M, z8.s, z0.s\n" + "add x26, x26, #0x4\n" + "add x25, x25, #0x4\n" + "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z26.s, p1/M, z8.s, z2.s\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "fmla z27.s, p1/M, z8.s, z3.s\n" + "fmla z28.s, p1/M, z8.s, z4.s\n" + "add x22, x22, #0x4\n" + "add x21, x21, #0x4\n" + "fmla z29.s, p1/M, z8.s, z5.s\n" + "ld1rw { z0.s }, p1/Z, [x27]\n" + "add x20, x20, #0x4\n" + "addvl x11, x11, #1\n" + "ld1rw { z1.s }, p1/Z, [x26]\n" + "fmla z30.s, p1/M, z8.s, z6.s\n" + "fmla z31.s, p1/M, z8.s, z7.s\n" + "ld1rw { z2.s }, p1/Z, [x25]\n" + "ld1rw { z3.s }, p1/Z, [x24]\n" + "ld1rw { z4.s }, p1/Z, [x23]\n" + "ld1rw { z5.s }, p1/Z, [x22]\n" + "ld1rw { z6.s }, p1/Z, [x21]\n" + "ld1rw { z7.s }, p1/Z, [x20]\n" + "bgt 93b\n" + "94:" // Height 8: 
Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "ld1w { z9.s }, p1/Z, [x11]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z25.s, p1/M, z9.s, z1.s\n" + "addvl x11, x11, #1\n" + "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z27.s, p1/M, z9.s, z3.s\n" + "fmla z28.s, p1/M, z9.s, z4.s\n" + "fmla z29.s, p1/M, z9.s, z5.s\n" + "fmla z30.s, p1/M, z9.s, z6.s\n" + "fmla z31.s, p1/M, z9.s, z7.s\n" + "bne 90b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "tbz %x[flags], #1, 95f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z17.s }, p1/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z16.s }, p1/Z, [x19]\n" + "fmin z24.s, p1/M, z24.s, z17.s\n" + "fmin z25.s, p1/M, z25.s, z17.s\n" + "fmin z26.s, p1/M, z26.s, z17.s\n" + "fmin z27.s, p1/M, z27.s, z17.s\n" + "fmin z28.s, p1/M, z28.s, z17.s\n" + "fmin z29.s, p1/M, z29.s, z17.s\n" + "fmin z30.s, p1/M, z30.s, z17.s\n" + "fmin z31.s, p1/M, z31.s, z17.s\n" + "fmax z24.s, p1/M, z24.s, z16.s\n" + "fmax z25.s, p1/M, z25.s, z16.s\n" + "fmax z26.s, p1/M, z26.s, z16.s\n" + "fmax z27.s, p1/M, z27.s, z16.s\n" + "fmax z28.s, p1/M, z28.s, z16.s\n" + "fmax z29.s, p1/M, z29.s, z16.s\n" + "fmax z30.s, p1/M, z30.s, z16.s\n" + "fmax z31.s, p1/M, z31.s, z16.s\n" + "95:" // Height 8: No activation + "st1w { z24.s }, p0, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x25]\n" + "st1w { z27.s }, p0, [x24]\n" + "st1w { z28.s }, p0, [x23]\n" + "st1w { z29.s }, p0, [x22]\n" + "st1w { z30.s }, p0, [x21]\n" + "st1w { z31.s }, p0, [x20]\n" + "96:" // Height 8: Writeback done + "decw x12\n" + "cmp x12, XZR\n" + "bgt 86b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 98f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 97f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "97:" // Update direct input + "mov x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "98:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp index 943e0ac148..5b4b6b9b2e 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp @@ -149,12 +149,11 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x27, x27, #0x10\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z11.s, z0.s[3]\n" "bgt 9b\n" @@ -164,7 +163,6 @@ void sve_hybrid_fp32_mla_8x1VL ( "subs x28, x28, #0x1\n" "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "add x27, x27, #0x10\n" "addvl x12, x12, #1\n" "ble 11f\n" "ld1w { z9.s }, p2/Z, [x12]\n" @@ -181,9 +179,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z24.s, z11.s, z0.s[3]\n" "addvl x12, x12, #1\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -254,18 +251,16 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x26, x26, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" "fmla z25.s, z11.s, z1.s[3]\n" @@ -277,16 +272,14 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "add x26, x26, #0x10\n" "addvl x12, x12, #1\n" + "fmla z25.s, z8.s, z1.s[0]\n" "ble 24f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" "subs x28, x28, #0x1\n" - "fmla z25.s, z9.s, z1.s[1]\n" "addvl x12, x12, #1\n" + "fmla z25.s, z9.s, z1.s[1]\n" "ble 24f\n" "ld1w { z10.s }, p2/Z, [x12]\n" "fmla z24.s, z10.s, z0.s[2]\n" @@ -299,10 +292,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "addvl x12, x12, #1\n" "fmla z25.s, z11.s, z1.s[3]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -385,22 +376,19 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x25, x25, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" 
- "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" @@ -415,13 +403,10 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "add x25, x25, #0x10\n" - "addvl x12, x12, #1\n" "ble 37f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -443,11 +428,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z25.s, z11.s, z1.s[3]\n" "fmla z26.s, z11.s, z2.s[3]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -541,27 +523,23 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" @@ -578,16 +556,12 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" "fmla z27.s, z8.s, z3.s[0]\n" - "add x24, x24, #0x10\n" - "addvl x12, x12, #1\n" "ble 50f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -612,12 +586,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z26.s, z11.s, z2.s[3]\n" "fmla z27.s, z11.s, z3.s[3]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -722,33 +692,28 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" 
"ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -766,19 +731,14 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" "fmla z28.s, z8.s, z4.s[0]\n" - "add x23, x23, #0x10\n" - "addvl x12, x12, #1\n" "ble 63f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -806,13 +766,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z27.s, z11.s, z3.s[3]\n" "fmla z28.s, z11.s, z4.s[3]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -928,38 +883,32 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -979,22 +928,16 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add 
x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" "fmla z29.s, z8.s, z5.s[0]\n" - "add x22, x22, #0x10\n" - "addvl x12, x12, #1\n" "ble 76f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1025,14 +968,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z28.s, z11.s, z4.s[3]\n" "fmla z29.s, z11.s, z5.s[3]\n" "76:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1159,43 +1096,36 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x21, x21, #0x10\n" "fmla z30.s, z8.s, z6.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1217,25 +1147,18 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z29.s, z8.s, z5.s[0]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "fmla z28.s, z8.s, 
z4.s[0]\n" + "fmla z29.s, z8.s, z5.s[0]\n" "fmla z30.s, z8.s, z6.s[0]\n" - "add x21, x21, #0x10\n" - "addvl x12, x12, #1\n" "ble 89f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1269,15 +1192,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z29.s, z11.s, z5.s[3]\n" "fmla z30.s, z11.s, z6.s[3]\n" "89:" // Height 7: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 84b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1418,48 +1334,40 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1rqw { z7.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x20, x20, #0x10\n" + "add x21, x21, #0x10\n" "fmla z30.s, z8.s, z6.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x20, x20, #0x10\n" "fmla z31.s, z8.s, z7.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z31.s, z9.s, z7.s[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x20, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1483,28 +1391,20 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z29.s, z8.s, z5.s[0]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "fmla z30.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z4.s[0]\n" "ld1rqw { z7.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "fmla z30.s, z8.s, z6.s[0]\n" "fmla z31.s, z8.s, z7.s[0]\n" - "add x20, x20, #0x10\n" - "addvl x12, x12, #1\n" 
"ble 102f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1541,16 +1441,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z30.s, z11.s, z6.s[3]\n" "fmla z31.s, z11.s, z7.s[3]\n" "102:" // Height 8: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 97b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp new file mode 100644 index 0000000000..2142f1067d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../bfloat.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_fp32bf16fp32_mmla_4x6VL( ARGLIST ); + +class cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL +{ +public: + typedef float lhs_operand_type; + typedef bfloat16 rhs_operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return get_vector_length<float>() * 6; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + default: + return { 16.63 }; + case CPUModel::A510: + return { 5.42 }; + case CPUModel::V1: + return { 28.40 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_4x6VL; + cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp new file mode 100644 index 0000000000..43b0f54805 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp @@ -0,0 +1,1306 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include <cassert> +#include <limits> + +namespace arm_gemm { + +void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p7.b\n" + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 40f\n" + "cmp %x[M], #0x2\n" + "bgt 27f\n" + "beq 14f\n" + "mov x9, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p6.s, x19, x28\n" + "incw x19\n" + "whilelt p5.s, x19, x28\n" + "incw x19\n" + "whilelt p4.s, x19, x28\n" + "incw x19\n" + "whilelt p3.s, x19, x28\n" + "incw x19\n" + "whilelt p2.s, x19, x28\n" + "incw x19\n" + "whilelt p1.s, x19, x28\n" + "cbz x9, 3f\n" + "ld1w { z8.s }, p7/Z, [x9]\n" + "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n" + "zip2 z14.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n" + "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n" + "zip2 z15.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n" + "zip2 z16.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "addvl x9, x9, #6\n" + "zip2 z17.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "zip2 z18.d, z12.d, z12.d\n" + "zip1 z12.d, z12.d, z12.d\n" + "zip2 z19.d, z13.d, z13.d\n" + "zip1 z13.d, z13.d, z13.d\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z9.s }, p6/Z, [x26]\n" + "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z14.d\n" + "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n" + "zip2 z14.d, z9.d, z14.d\n" + "zip1 z9.d, z10.d, z15.d\n" + "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n" + "zip2 z15.d, z10.d, z15.d\n" + "zip1 z10.d, z11.d, z16.d\n" + "zip2 
z16.d, z11.d, z16.d\n" + "zip1 z11.d, z12.d, z17.d\n" + "zip2 z17.d, z12.d, z17.d\n" + "zip1 z12.d, z13.d, z18.d\n" + "zip2 z18.d, z13.d, z18.d\n" + "zip1 z13.d, z20.d, z19.d\n" + "zip2 z19.d, z20.d, z19.d\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "5:" // Height 1: setup done + "mov x25, #0x0\n" + "6:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w24, [x19, x25, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x24, #0x4\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + "sub x24, x24, #0x4\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + "cmp x24, #0x4\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "add x23, x23, #0x10\n" + "addvl x27, x27, #-4\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 
0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "addvl x27, x27, #-4\n" + "11:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 6b\n" + "uzp1 z8.d, z8.d, z14.d\n" + "uzp1 z9.d, z9.d, z15.d\n" + "uzp1 z10.d, z10.d, z16.d\n" + "uzp1 z11.d, z11.d, z17.d\n" + "uzp1 z12.d, z12.d, z18.d\n" + "uzp1 z13.d, z13.d, z19.d\n" + "tbz %x[flags], #1, 12f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p7/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p7/Z, [x19]\n" + "fmin z8.s, p7/M, z8.s, z1.s\n" + "fmin z9.s, p7/M, z9.s, z1.s\n" + "fmin z10.s, p7/M, z10.s, z1.s\n" + "fmin z11.s, p7/M, z11.s, z1.s\n" + "fmin z12.s, p7/M, z12.s, z1.s\n" + "fmin z13.s, p7/M, z13.s, z1.s\n" + "fmax z8.s, p7/M, z8.s, z0.s\n" + "fmax z9.s, p7/M, z9.s, z0.s\n" + "fmax z10.s, p7/M, z10.s, z0.s\n" + "fmax z11.s, p7/M, z11.s, z0.s\n" + "fmax z12.s, p7/M, z12.s, z0.s\n" + "fmax z13.s, p7/M, z13.s, z0.s\n" + "12:" // Height 1: No activation + "st1w { z8.s }, p6, [x26]\n" + "st1w { z9.s }, p5, [x26, #1, MUL VL]\n" + "st1w { z10.s }, p4, [x26, #2, MUL VL]\n" + "st1w { z11.s }, p3, [x26, #3, MUL VL]\n" + "st1w { z12.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z13.s }, p1, [x26, #5, MUL VL]\n" + "addvl x26, x26, #6\n" + "13:" // Height 1: Writeback done + "decw x28, ALL, MUL #6\n" + "cmp x28, XZR\n" + "bgt 2b\n" + "b 54f\n" + "14:" // Height 2 + "mov x9, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p6.s, x19, x28\n" + "incw x19\n" + "whilelt p5.s, x19, x28\n" + "incw x19\n" + "whilelt p4.s, x19, x28\n" + "incw x19\n" + "whilelt p3.s, x19, x28\n" + "incw x19\n" + "whilelt p2.s, x19, x28\n" + "incw x19\n" + "whilelt p1.s, x19, x28\n" + "cbz x9, 16f\n" + "ld1w { z8.s }, p7/Z, [x9]\n" + "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n" + "zip2 z14.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n" + "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n" + "zip2 z15.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n" + "zip2 z16.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "addvl x9, x9, #6\n" + "zip2 z17.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "zip2 z18.d, z12.d, z12.d\n" + "zip1 z12.d, z12.d, z12.d\n" + "zip2 z19.d, z13.d, z13.d\n" + "zip1 z13.d, z13.d, z13.d\n" + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19, LSL #2\n" + "ld1w { z9.s }, p6/Z, [x26]\n" + "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n" + "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x22]\n" + "zip1 z8.d, z9.d, z14.d\n" + "zip2 z14.d, z9.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22, #2, MUL 
VL]\n" + "zip1 z9.d, z10.d, z15.d\n" + "zip2 z15.d, z10.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n" + "zip1 z10.d, z11.d, z16.d\n" + "zip2 z16.d, z11.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n" + "zip1 z11.d, z12.d, z17.d\n" + "zip2 z17.d, z12.d, z17.d\n" + "zip1 z12.d, z13.d, z18.d\n" + "zip2 z18.d, z13.d, z18.d\n" + "zip1 z13.d, z20.d, z19.d\n" + "zip2 z19.d, z20.d, z19.d\n" + "b 18f\n" + "17:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "18:" // Height 2: setup done + "mov x25, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w24, [x19, x25, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "21:" // Height 2: input setup done + "cmp x24, #0x4\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "trn1 z0.d, z0.d, z1.d\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + "sub x24, x24, #0x4\n" + "cmp x24, #0x4\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "addvl x27, x27, #-4\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "trn1 z0.d, z0.d, z1.d\n" + "ld1h { 
z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + "addvl x27, x27, #-4\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "24:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 z4.d, z8.d, z14.d\n" + "uzp2 z8.d, z8.d, z14.d\n" + "add x22, x26, x19, LSL #2\n" + "uzp1 z14.d, z9.d, z15.d\n" + "uzp2 z9.d, z9.d, z15.d\n" + "uzp1 z15.d, z10.d, z16.d\n" + "uzp2 z10.d, z10.d, z16.d\n" + "uzp1 z16.d, z11.d, z17.d\n" + "uzp2 z11.d, z11.d, z17.d\n" + "uzp1 z17.d, z12.d, z18.d\n" + "uzp2 z12.d, z12.d, z18.d\n" + "uzp1 z18.d, z13.d, z19.d\n" + "uzp2 z13.d, z13.d, z19.d\n" + "tbz %x[flags], #1, 25f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p7/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p7/Z, [x19]\n" + "fmin z4.s, p7/M, z4.s, z1.s\n" + "fmin z14.s, p7/M, z14.s, z1.s\n" + "fmin z15.s, p7/M, z15.s, z1.s\n" + "fmin z16.s, p7/M, z16.s, z1.s\n" + "fmin z17.s, p7/M, z17.s, z1.s\n" + "fmin z18.s, p7/M, z18.s, z1.s\n" + "fmin z8.s, p7/M, z8.s, z1.s\n" + "fmin z9.s, p7/M, z9.s, z1.s\n" + "fmin z10.s, p7/M, z10.s, z1.s\n" + "fmin z11.s, p7/M, z11.s, z1.s\n" + "fmin z12.s, p7/M, z12.s, z1.s\n" + "fmin z13.s, p7/M, z13.s, z1.s\n" + "fmax z4.s, p7/M, z4.s, z0.s\n" + "fmax z14.s, p7/M, z14.s, z0.s\n" + "fmax z15.s, p7/M, z15.s, z0.s\n" + "fmax z16.s, p7/M, z16.s, z0.s\n" + "fmax z17.s, p7/M, z17.s, z0.s\n" + "fmax z18.s, p7/M, z18.s, z0.s\n" + "fmax z8.s, p7/M, z8.s, z0.s\n" + "fmax z9.s, p7/M, z9.s, z0.s\n" + "fmax z10.s, p7/M, z10.s, z0.s\n" + "fmax z11.s, p7/M, z11.s, z0.s\n" + "fmax z12.s, p7/M, z12.s, z0.s\n" + "fmax z13.s, p7/M, z13.s, z0.s\n" + "25:" // Height 2: No activation + "st1w { z4.s }, p6, [x26]\n" + "st1w { z14.s }, p5, [x26, #1, MUL VL]\n" + "st1w { z15.s }, p4, [x26, #2, MUL VL]\n" + "st1w { z16.s }, p3, [x26, #3, MUL VL]\n" + "st1w { z17.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z18.s }, p1, [x26, #5, MUL VL]\n" + "addvl x26, x26, #6\n" + "st1w { z8.s }, p6, [x22]\n" + "st1w { z9.s }, p5, [x22, #1, MUL VL]\n" + "st1w { z10.s }, p4, [x22, #2, MUL VL]\n" + "st1w { z11.s }, p3, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p2, [x22, #4, MUL VL]\n" + "st1w { z13.s }, p1, [x22, #5, MUL VL]\n" + "26:" // Height 2: Writeback done + "decw x28, ALL, MUL #6\n" + "cmp x28, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 + "mov x9, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x27, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "28:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p6.s, x19, x28\n" + "incw x19\n" + "whilelt p5.s, x19, x28\n" + "incw x19\n" + "whilelt p4.s, x19, x28\n" + "incw x19\n" + "whilelt p3.s, x19, x28\n" + "incw x19\n" + "whilelt p2.s, x19, x28\n" + "incw x19\n" + "whilelt p1.s, x19, x28\n" + "cbz x9, 29f\n" + "ld1w { z8.s }, p7/Z, [x9]\n" + "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n" + "zip2 z14.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n" + "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n" + "zip2 z15.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n" + "zip2 z16.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "addvl x9, x9, #6\n" + "zip2 z17.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "zip2 z18.d, z12.d, z12.d\n" + "zip1 z12.d, z12.d, z12.d\n" + "zip2 z19.d, z13.d, z13.d\n" + "zip1 z13.d, z13.d, z13.d\n" + "mov z20.d, z8.d\n" + "mov z26.d, z14.d\n" + "mov z21.d, z9.d\n" + "mov z27.d, z15.d\n" + "mov z22.d, z10.d\n" + "mov z28.d, z16.d\n" + "mov z23.d, z11.d\n" + "mov z29.d, z17.d\n" + "mov z24.d, z12.d\n" + "mov z30.d, z18.d\n" + "mov z25.d, z13.d\n" + "mov z31.d, z19.d\n" + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z9.s }, p6/Z, [x26]\n" + "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n" + "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x22]\n" + "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z14.d\n" + "zip2 z14.d, z9.d, z14.d\n" + "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n" + "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z15.d\n" + "zip2 z15.d, z10.d, z15.d\n" + "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n" + "zip1 z10.d, z11.d, z16.d\n" + "zip2 z16.d, z11.d, z16.d\n" + "ld1w { z21.s }, p6/Z, [x21]\n" + "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n" + "zip1 z11.d, z12.d, z17.d\n" + "zip2 z17.d, z12.d, z17.d\n" + "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n" + "zip1 z12.d, z13.d, z18.d\n" + "zip2 z18.d, z13.d, z18.d\n" + "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n" + "ld1w { z4.s }, p1/Z, [x21, #5, MUL VL]\n" + "zip1 z13.d, z20.d, z19.d\n" + "zip2 z19.d, z20.d, z19.d\n" + "zip1 z20.d, z21.d, z26.d\n" + "zip2 z26.d, z21.d, z26.d\n" + "zip1 z21.d, z22.d, z27.d\n" + "zip2 z27.d, z22.d, z27.d\n" + "zip1 z22.d, z23.d, z28.d\n" + "zip2 z28.d, z23.d, z28.d\n" + "zip1 z23.d, z24.d, z29.d\n" + "zip2 z29.d, z24.d, z29.d\n" + "zip1 z24.d, z25.d, z30.d\n" + "zip2 z30.d, z25.d, z30.d\n" + "zip1 z25.d, z4.d, z31.d\n" + "zip2 z31.d, z4.d, z31.d\n" + "b 31f\n" + "30:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, 
#0x0\n" + "mov z31.b, #0x0\n" + "31:" // Height 3: setup done + "mov x25, #0x0\n" + "32:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w24, [x19, x25, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "34:" // Height 3: input setup done + "cmp x24, #0x4\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "uzp1 z1.h, z1.h, z1.h\n" + ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "trn1 z0.d, z0.d, z1.d\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + "sub x24, x24, #0x4\n" + ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + "cmp x24, #0x4\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "add x23, x23, #0x10\n" + ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + "add x22, x22, #0x10\n" + ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + "addvl x27, x27, #-4\n" + ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + 
"ld1h { z4.h }, p7/Z, [x27]\n" + "uzp1 z1.h, z1.h, z1.h\n" + ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "trn1 z0.d, z0.d, z1.d\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + "addvl x27, x27, #-4\n" + ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + "37:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19, LSL #2\n" + "uzp1 z4.d, z8.d, z14.d\n" + "uzp2 z8.d, z8.d, z14.d\n" + "uzp1 z14.d, z9.d, z15.d\n" + "uzp2 z9.d, z9.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 z15.d, z10.d, z16.d\n" + "uzp2 z10.d, z10.d, z16.d\n" + "uzp1 z16.d, z11.d, z17.d\n" + "uzp2 z11.d, z11.d, z17.d\n" + "uzp1 z17.d, z12.d, z18.d\n" + "uzp2 z12.d, z12.d, z18.d\n" + "uzp1 z18.d, z13.d, z19.d\n" + "uzp2 z13.d, z13.d, z19.d\n" + "uzp1 z20.d, z20.d, z26.d\n" + "uzp1 z21.d, z21.d, z27.d\n" + "uzp1 z22.d, z22.d, z28.d\n" + "uzp1 z23.d, z23.d, z29.d\n" + "uzp1 z24.d, z24.d, z30.d\n" + "uzp1 z25.d, z25.d, z31.d\n" + "tbz %x[flags], #1, 38f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p7/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p7/Z, [x19]\n" + "fmin z4.s, p7/M, z4.s, z1.s\n" + "fmin z14.s, p7/M, z14.s, z1.s\n" + "fmin z15.s, p7/M, z15.s, z1.s\n" + "fmin z16.s, p7/M, z16.s, z1.s\n" + "fmin z17.s, p7/M, z17.s, z1.s\n" + "fmin z18.s, p7/M, z18.s, z1.s\n" + "fmin z8.s, p7/M, z8.s, z1.s\n" + "fmin z9.s, p7/M, z9.s, z1.s\n" + "fmin z10.s, p7/M, z10.s, z1.s\n" + "fmin z11.s, p7/M, z11.s, z1.s\n" + "fmin z12.s, p7/M, z12.s, z1.s\n" + "fmin z13.s, p7/M, z13.s, z1.s\n" + "fmin z20.s, p7/M, z20.s, z1.s\n" + "fmin z21.s, p7/M, z21.s, z1.s\n" + "fmin z22.s, p7/M, z22.s, z1.s\n" + "fmin z23.s, p7/M, z23.s, z1.s\n" + "fmin z24.s, p7/M, z24.s, 
z1.s\n" + "fmin z25.s, p7/M, z25.s, z1.s\n" + "fmax z4.s, p7/M, z4.s, z0.s\n" + "fmax z14.s, p7/M, z14.s, z0.s\n" + "fmax z15.s, p7/M, z15.s, z0.s\n" + "fmax z16.s, p7/M, z16.s, z0.s\n" + "fmax z17.s, p7/M, z17.s, z0.s\n" + "fmax z18.s, p7/M, z18.s, z0.s\n" + "fmax z8.s, p7/M, z8.s, z0.s\n" + "fmax z9.s, p7/M, z9.s, z0.s\n" + "fmax z10.s, p7/M, z10.s, z0.s\n" + "fmax z11.s, p7/M, z11.s, z0.s\n" + "fmax z12.s, p7/M, z12.s, z0.s\n" + "fmax z13.s, p7/M, z13.s, z0.s\n" + "fmax z20.s, p7/M, z20.s, z0.s\n" + "fmax z21.s, p7/M, z21.s, z0.s\n" + "fmax z22.s, p7/M, z22.s, z0.s\n" + "fmax z23.s, p7/M, z23.s, z0.s\n" + "fmax z24.s, p7/M, z24.s, z0.s\n" + "fmax z25.s, p7/M, z25.s, z0.s\n" + "38:" // Height 3: No activation + "st1w { z4.s }, p6, [x26]\n" + "st1w { z14.s }, p5, [x26, #1, MUL VL]\n" + "st1w { z15.s }, p4, [x26, #2, MUL VL]\n" + "st1w { z16.s }, p3, [x26, #3, MUL VL]\n" + "st1w { z17.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z18.s }, p1, [x26, #5, MUL VL]\n" + "addvl x26, x26, #6\n" + "st1w { z8.s }, p6, [x22]\n" + "st1w { z9.s }, p5, [x22, #1, MUL VL]\n" + "st1w { z10.s }, p4, [x22, #2, MUL VL]\n" + "st1w { z11.s }, p3, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p2, [x22, #4, MUL VL]\n" + "st1w { z13.s }, p1, [x22, #5, MUL VL]\n" + "st1w { z20.s }, p6, [x21]\n" + "st1w { z21.s }, p5, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p4, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p3, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p2, [x21, #4, MUL VL]\n" + "st1w { z25.s }, p1, [x21, #5, MUL VL]\n" + "39:" // Height 3: Writeback done + "decw x28, ALL, MUL #6\n" + "cmp x28, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x10\n" + "mov x9, %x[bias]\n" + "ldr x28, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x26, %x[output_ptr]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "41:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p6.s, x19, x28\n" + "incw x19\n" + "whilelt p5.s, x19, x28\n" + "incw x19\n" + "whilelt p4.s, x19, x28\n" + "incw x19\n" + "whilelt p3.s, x19, x28\n" + "incw x19\n" + "whilelt p2.s, x19, x28\n" + "incw x19\n" + "whilelt p1.s, x19, x28\n" + "cbz x9, 42f\n" + "ld1w { z8.s }, p7/Z, [x9]\n" + "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n" + "zip2 z14.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n" + "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n" + "zip2 z15.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n" + "zip2 z16.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "addvl x9, x9, #6\n" + "zip2 z17.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "zip2 z18.d, z12.d, z12.d\n" + "zip1 z12.d, z12.d, z12.d\n" + "zip2 z19.d, z13.d, z13.d\n" + "zip1 z13.d, z13.d, z13.d\n" + "mov z20.d, z8.d\n" + "mov z26.d, z14.d\n" + "mov z21.d, z9.d\n" + "mov z27.d, z15.d\n" + "mov z22.d, z10.d\n" + "mov z28.d, z16.d\n" + "mov z23.d, z11.d\n" + "mov z29.d, z17.d\n" + "mov z24.d, z12.d\n" + "mov z30.d, z18.d\n" + "mov z25.d, z13.d\n" + "mov z31.d, z19.d\n" + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z9.s }, p6/Z, [x26]\n" + "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n" + "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n" + "ld1w { z13.s }, 
p2/Z, [x26, #4, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x22]\n" + "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z14.d\n" + "zip2 z14.d, z9.d, z14.d\n" + "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n" + "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z15.d\n" + "zip2 z15.d, z10.d, z15.d\n" + "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n" + "zip1 z10.d, z11.d, z16.d\n" + "zip2 z16.d, z11.d, z16.d\n" + "ld1w { z21.s }, p6/Z, [x21]\n" + "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n" + "zip1 z11.d, z12.d, z17.d\n" + "zip2 z17.d, z12.d, z17.d\n" + "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n" + "zip1 z12.d, z13.d, z18.d\n" + "zip2 z18.d, z13.d, z18.d\n" + "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n" + "ld1w { z4.s }, p1/Z, [x21, #5, MUL VL]\n" + "zip1 z13.d, z20.d, z19.d\n" + "zip2 z19.d, z20.d, z19.d\n" + "ld1w { z26.s }, p6/Z, [x20]\n" + "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n" + "zip1 z20.d, z21.d, z26.d\n" + "zip2 z26.d, z21.d, z26.d\n" + "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n" + "zip1 z21.d, z22.d, z27.d\n" + "zip2 z27.d, z22.d, z27.d\n" + "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n" + "zip1 z22.d, z23.d, z28.d\n" + "zip2 z28.d, z23.d, z28.d\n" + "zip1 z23.d, z24.d, z29.d\n" + "zip2 z29.d, z24.d, z29.d\n" + "zip1 z24.d, z25.d, z30.d\n" + "zip2 z30.d, z25.d, z30.d\n" + "zip1 z25.d, z4.d, z31.d\n" + "zip2 z31.d, z4.d, z31.d\n" + "b 44f\n" + "43:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "44:" // Height 4: setup done + "mov x25, #0x0\n" + "45:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w24, [x19, x25, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "47:" // Height 4: input setup done + "cmp x24, #0x4\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + "ld1rqw { z3.s }, p0/Z, [x20]\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" + ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, 
[x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + "uzp1 z3.h, z3.h, z3.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "sub x24, x24, #0x4\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + "cmp x24, #0x4\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + "add x23, x23, #0x10\n" + ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + "add x22, x22, #0x10\n" + ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + "add x20, x20, #0x10\n" + ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + "addvl x27, x27, #-4\n" + ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x24\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + "ld1rqw { z3.s }, p0/Z, [x20]\n" + ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" + ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z4.h }, p7/Z, [x27]\n" + "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n" + "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n" + "uzp1 z3.h, z3.h, z3.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n" + ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n" + ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n" + ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6464e456 // bfmmla z22.s, z2.h, 
z4.h\n" + ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" + "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n" + ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n" + ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" + "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n" + ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" + "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + "addvl x27, x27, #-4\n" + ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" + ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" + ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" + ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + "50:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "uzp1 z4.d, z8.d, z14.d\n" + "uzp2 z8.d, z8.d, z14.d\n" + "uzp1 z14.d, z9.d, z15.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z15.d\n" + "uzp1 z15.d, z10.d, z16.d\n" + "uzp2 z10.d, z10.d, z16.d\n" + "uzp1 z16.d, z11.d, z17.d\n" + "uzp2 z11.d, z11.d, z17.d\n" + "uzp1 z17.d, z12.d, z18.d\n" + "uzp2 z12.d, z12.d, z18.d\n" + "uzp1 z18.d, z13.d, z19.d\n" + "uzp2 z13.d, z13.d, z19.d\n" + "uzp1 z19.d, z20.d, z26.d\n" + "uzp2 z20.d, z20.d, z26.d\n" + "uzp1 z26.d, z21.d, z27.d\n" + "uzp2 z21.d, z21.d, z27.d\n" + "uzp1 z27.d, z22.d, z28.d\n" + "uzp2 z22.d, z22.d, z28.d\n" + "uzp1 z28.d, z23.d, z29.d\n" + "uzp2 z23.d, z23.d, z29.d\n" + "uzp1 z29.d, z24.d, z30.d\n" + "uzp2 z24.d, z24.d, z30.d\n" + "uzp1 z30.d, z25.d, z31.d\n" + "uzp2 z25.d, z25.d, z31.d\n" + "tbz %x[flags], #1, 51f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p7/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p7/Z, [x19]\n" + "fmin z4.s, p7/M, z4.s, z1.s\n" + "fmin z14.s, p7/M, z14.s, z1.s\n" + "fmin z15.s, p7/M, z15.s, z1.s\n" + "fmin z16.s, p7/M, z16.s, z1.s\n" + "fmin z17.s, p7/M, z17.s, z1.s\n" + "fmin z18.s, p7/M, z18.s, z1.s\n" + "fmin z8.s, p7/M, z8.s, z1.s\n" + "fmin z9.s, p7/M, z9.s, z1.s\n" + "fmin z10.s, p7/M, z10.s, z1.s\n" + "fmin z11.s, p7/M, z11.s, z1.s\n" + "fmin z12.s, p7/M, z12.s, z1.s\n" + "fmin z13.s, p7/M, z13.s, z1.s\n" + "fmin z19.s, p7/M, z19.s, z1.s\n" + "fmin z26.s, p7/M, z26.s, z1.s\n" + "fmin z27.s, p7/M, z27.s, z1.s\n" + "fmin z28.s, p7/M, z28.s, z1.s\n" + "fmin z29.s, p7/M, z29.s, z1.s\n" + "fmin z30.s, p7/M, z30.s, z1.s\n" + "fmin z20.s, p7/M, z20.s, z1.s\n" + "fmin z21.s, p7/M, z21.s, z1.s\n" + "fmin z22.s, p7/M, z22.s, z1.s\n" + "fmin z23.s, p7/M, z23.s, z1.s\n" + "fmin z24.s, p7/M, z24.s, z1.s\n" + "fmin z25.s, p7/M, z25.s, z1.s\n" + "fmax z4.s, p7/M, z4.s, z0.s\n" + "fmax z14.s, p7/M, z14.s, z0.s\n" + "fmax z15.s, p7/M, z15.s, z0.s\n" + "fmax z16.s, p7/M, z16.s, z0.s\n" + "fmax z17.s, p7/M, z17.s, z0.s\n" + "fmax z18.s, p7/M, z18.s, z0.s\n" + "fmax z8.s, p7/M, z8.s, z0.s\n" + "fmax z9.s, p7/M, z9.s, z0.s\n" + "fmax z10.s, p7/M, z10.s, z0.s\n" + "fmax z11.s, p7/M, z11.s, z0.s\n" + "fmax z12.s, p7/M, z12.s, z0.s\n" + "fmax z13.s, p7/M, z13.s, z0.s\n" + "fmax z19.s, p7/M, z19.s, z0.s\n" + "fmax z26.s, p7/M, z26.s, z0.s\n" + "fmax z27.s, p7/M, z27.s, 
z0.s\n" + "fmax z28.s, p7/M, z28.s, z0.s\n" + "fmax z29.s, p7/M, z29.s, z0.s\n" + "fmax z30.s, p7/M, z30.s, z0.s\n" + "fmax z20.s, p7/M, z20.s, z0.s\n" + "fmax z21.s, p7/M, z21.s, z0.s\n" + "fmax z22.s, p7/M, z22.s, z0.s\n" + "fmax z23.s, p7/M, z23.s, z0.s\n" + "fmax z24.s, p7/M, z24.s, z0.s\n" + "fmax z25.s, p7/M, z25.s, z0.s\n" + "51:" // Height 4: No activation + "st1w { z4.s }, p6, [x26]\n" + "st1w { z14.s }, p5, [x26, #1, MUL VL]\n" + "st1w { z15.s }, p4, [x26, #2, MUL VL]\n" + "st1w { z16.s }, p3, [x26, #3, MUL VL]\n" + "st1w { z17.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z18.s }, p1, [x26, #5, MUL VL]\n" + "addvl x26, x26, #6\n" + "st1w { z8.s }, p6, [x22]\n" + "st1w { z9.s }, p5, [x22, #1, MUL VL]\n" + "st1w { z10.s }, p4, [x22, #2, MUL VL]\n" + "st1w { z11.s }, p3, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p2, [x22, #4, MUL VL]\n" + "st1w { z13.s }, p1, [x22, #5, MUL VL]\n" + "st1w { z19.s }, p6, [x21]\n" + "st1w { z26.s }, p5, [x21, #1, MUL VL]\n" + "st1w { z27.s }, p4, [x21, #2, MUL VL]\n" + "st1w { z28.s }, p3, [x21, #3, MUL VL]\n" + "st1w { z29.s }, p2, [x21, #4, MUL VL]\n" + "st1w { z30.s }, p1, [x21, #5, MUL VL]\n" + "st1w { z20.s }, p6, [x20]\n" + "st1w { z21.s }, p5, [x20, #1, MUL VL]\n" + "st1w { z22.s }, p4, [x20, #2, MUL VL]\n" + "st1w { z23.s }, p3, [x20, #3, MUL VL]\n" + "st1w { z24.s }, p2, [x20, #4, MUL VL]\n" + "st1w { z25.s }, p1, [x20, #5, MUL VL]\n" + "52:" // Height 4: Writeback done + "decw x28, ALL, MUL #6\n" + "cmp x28, XZR\n" + "bgt 41b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 54f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 53f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "53:" // Update direct input + "mov x19, #0x10\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "54:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp new file mode 100644 index 0000000000..d941ccc0e9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021 Arm Limited. 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..d941ccc0e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<float>, \
+    size_t, size_t, \
+    const bfloat16 *, \
+    IndirectOutputArg<float>, \
+    const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL
+{
+public:
+    typedef float lhs_operand_type;
+    typedef bfloat16 rhs_operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+
+        if (std::is_same<T, float>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 14.06 };
+                case CPUModel::A510:
+                    return { 5.31 };
+                case CPUModel::V1:
+                    return { 26.64 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_6x4VL;
+    cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
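The class above only describes blocking: each kernel call writes a 6-row by 4-vector tile of fp32 output and consumes K in groups of 4, since one BFMMLA multiplies 2x4 bf16 blocks (which is also why the assembly zips and unzips accumulator rows in pairs). A small sketch of how the tile width scales with the SVE vector length; `fp32_lanes` is an illustrative stand-in for the library's `get_vector_length<float>()`:

    #include <cstddef>
    #include <cstdio>

    // Stand-in for arm_gemm's get_vector_length<float>(): the number of fp32
    // lanes in one SVE vector of the given bit width.
    static std::size_t fp32_lanes(std::size_t sve_bits) { return sve_bits / 32; }

    int main() {
        // out_width() is 4 SVE vectors of fp32, out_height() is 6 rows, and
        // k_unroll() is 4 because one BFMMLA step consumes 4 bf16 K-elements.
        for (std::size_t bits : {128, 256, 512}) {
            std::printf("SVE %zu-bit: 6 x %zu fp32 output tile, k_unroll 4\n",
                        bits, 4 * fp32_lanes(bits));
        }
        return 0;
    }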
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..236eebad66
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1793 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const bfloat16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+ "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 66f\n" + "cmp %x[M], #0x4\n" + "bgt 53f\n" + "beq 40f\n" + "cmp %x[M], #0x2\n" + "bgt 27f\n" + "beq 14f\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 3f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" +
"ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x26, #0x4\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "sub x26, x26, #0x4\n" + "cmp x26, #0x4\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "add x25, x25, #0x10\n" + "addvl x9, x9, #8\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, 
#6, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "addvl x9, x9, #8\n" + "11:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 6b\n" + "uzp1 z8.d, z8.d, z12.d\n" + "uzp1 z9.d, z9.d, z13.d\n" + "uzp1 z10.d, z10.d, z14.d\n" + "uzp1 z11.d, z11.d, z15.d\n" + "tbz %x[flags], #1, 12f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "12:" // Height 1: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "13:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 16f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" + "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z9.d, z12.d\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 18f\n" + "17:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "18:" // Height 2: setup done + "mov x27, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, 
x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "21:" // Height 2: input setup done + "cmp x26, #0x4\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "sub x26, x26, #0x4\n" + "cmp x26, #0x4\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "addvl x9, x9, #8\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "24:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 z6.d, z8.d, z12.d\n" + "uzp2 z8.d, z8.d, z12.d\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "tbz %x[flags], #1, 25f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z6.s, p5/M, z6.s, z1.s\n" + "fmin z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z1.s\n" + "fmin z14.s, p5/M, z14.s, z1.s\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmax z6.s, p5/M, z6.s, z0.s\n" + "fmax 
z12.s, p5/M, z12.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z0.s\n" + "fmax z14.s, p5/M, z14.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "25:" // Height 2: No activation + "st1w { z6.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "26:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "28:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 29f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" + "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z16.d, z8.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z12.d\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 31f\n" + "30:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "31:" // Height 3: setup done + "mov x27, #0x0\n" + "32:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "34:" // Height 3: input setup done + "cmp x26, #0x4\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "uzp1 z1.h, z1.h, z1.h\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "trn1 z0.d, z0.d, z1.d\n" + "uzp1 z2.h, z2.h, z2.h\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "cmp x26, #0x4\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x10\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "add x24, x24, #0x10\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + "add x23, x23, #0x10\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "uzp1 z1.h, z1.h, z1.h\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "trn1 z0.d, z0.d, z1.d\n" + "uzp1 z2.h, z2.h, z2.h\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // 
bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + "37:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "uzp1 z6.d, z8.d, z12.d\n" + "uzp2 z8.d, z8.d, z12.d\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z16.d, z16.d, z20.d\n" + "uzp1 z17.d, z17.d, z21.d\n" + "uzp1 z18.d, z18.d, z22.d\n" + "uzp1 z19.d, z19.d, z23.d\n" + "tbz %x[flags], #1, 38f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z6.s, p5/M, z6.s, z1.s\n" + "fmin z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z1.s\n" + "fmin z14.s, p5/M, z14.s, z1.s\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmin z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z1.s\n" + "fmin z18.s, p5/M, z18.s, z1.s\n" + "fmin z19.s, p5/M, z19.s, z1.s\n" + "fmax z6.s, p5/M, z6.s, z0.s\n" + "fmax z12.s, p5/M, z12.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z0.s\n" + "fmax z14.s, p5/M, z14.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "fmax z16.s, p5/M, z16.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z0.s\n" + "fmax z18.s, p5/M, z18.s, z0.s\n" + "fmax z19.s, p5/M, z19.s, z0.s\n" + "38:" // Height 3: No activation + "st1w { z6.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "39:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "41:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 42f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" + "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, 
z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z16.d, z8.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z12.d\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 44f\n" + "43:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "44:" // Height 4: setup done + "mov x27, #0x0\n" + "45:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "47:" // Height 4: input setup done + "cmp x26, #0x4\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "sub x26, x26, 
#0x4\n" + "cmp x26, #0x4\n" + "uzp1 z3.h, z3.h, z3.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "add x25, x25, #0x10\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "add x24, x24, #0x10\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + "add x23, x23, #0x10\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + "add x22, x22, #0x10\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "uzp1 z3.h, z3.h, z3.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + "50:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z6.d, z8.d, z12.d\n" + "uzp2 z8.d, z8.d, z12.d\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "uzp1 z13.d, z10.d, 
z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "tbz %x[flags], #1, 51f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z6.s, p5/M, z6.s, z1.s\n" + "fmin z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z1.s\n" + "fmin z14.s, p5/M, z14.s, z1.s\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmin z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z1.s\n" + "fmin z22.s, p5/M, z22.s, z1.s\n" + "fmin z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z1.s\n" + "fmin z18.s, p5/M, z18.s, z1.s\n" + "fmin z19.s, p5/M, z19.s, z1.s\n" + "fmax z6.s, p5/M, z6.s, z0.s\n" + "fmax z12.s, p5/M, z12.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z0.s\n" + "fmax z14.s, p5/M, z14.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "fmax z15.s, p5/M, z15.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z0.s\n" + "fmax z22.s, p5/M, z22.s, z0.s\n" + "fmax z16.s, p5/M, z16.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z0.s\n" + "fmax z18.s, p5/M, z18.s, z0.s\n" + "fmax z19.s, p5/M, z19.s, z0.s\n" + "51:" // Height 4: No activation + "st1w { z6.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "52:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "54:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 55f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" + "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z16.d, z8.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "mov z24.d, z8.d\n" + "mov z28.d, z12.d\n" 
+ "mov z25.d, z9.d\n" + "mov z29.d, z13.d\n" + "mov z26.d, z10.d\n" + "mov z30.d, z14.d\n" + "mov z27.d, z11.d\n" + "mov z31.d, z15.d\n" + "b 57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z12.d\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z25.s }, p4/Z, [x21]\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "zip1 z24.d, z25.d, z28.d\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 57f\n" + "56:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "57:" // Height 5: setup done + "mov x27, #0x0\n" + "58:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 60f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "60:" // Height 5: input setup done + "cmp x26, #0x4\n" + 
"ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "uzp1 z3.h, z3.h, z3.h\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + "cmp x26, #0x4\n" + "add x25, x25, #0x10\n" + "trn1 z0.d, z0.d, z1.d\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "add x24, x24, #0x10\n" + "uzp1 z4.h, z4.h, z4.h\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "add x21, x21, #0x10\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z3.h, z3.h, z3.h\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + "trn1 z0.d, z0.d, z1.d\n" + "trn1 z2.d, z2.d, z3.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "uzp1 z4.h, z4.h, z4.h\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, 
z7.h\n" + ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + "63:" // Height 5: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z6.d, z8.d, z12.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z8.d, z8.d, z12.d\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z24.d, z24.d, z28.d\n" + "uzp1 z25.d, z25.d, z29.d\n" + "uzp1 z26.d, z26.d, z30.d\n" + "uzp1 z27.d, z27.d, z31.d\n" + "tbz %x[flags], #1, 64f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z6.s, p5/M, z6.s, z1.s\n" + "fmin z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z1.s\n" + "fmin z14.s, p5/M, z14.s, z1.s\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmin z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z1.s\n" + "fmin z22.s, p5/M, z22.s, z1.s\n" + "fmin z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z1.s\n" + "fmin z18.s, p5/M, z18.s, z1.s\n" + "fmin z19.s, p5/M, z19.s, z1.s\n" + "fmin z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z1.s\n" + "fmin z26.s, p5/M, z26.s, z1.s\n" + "fmin z27.s, p5/M, z27.s, z1.s\n" + "fmax z6.s, p5/M, z6.s, z0.s\n" + "fmax z12.s, p5/M, z12.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z0.s\n" + "fmax z14.s, p5/M, z14.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "fmax z15.s, p5/M, z15.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z0.s\n" + "fmax z22.s, p5/M, z22.s, z0.s\n" + "fmax z16.s, p5/M, z16.s, z0.s\n" + "fmax z17.s, 
p5/M, z17.s, z0.s\n" + "fmax z18.s, p5/M, z18.s, z0.s\n" + "fmax z19.s, p5/M, z19.s, z0.s\n" + "fmax z24.s, p5/M, z24.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z0.s\n" + "fmax z26.s, p5/M, z26.s, z0.s\n" + "fmax z27.s, p5/M, z27.s, z0.s\n" + "64:" // Height 5: No activation + "st1w { z6.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "65:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "mov x11, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "67:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "cbz x11, 68f\n" + "ld1w { z8.s }, p5/Z, [x11]\n" + "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n" + "zip2 z12.d, z8.d, z8.d\n" + "zip1 z8.d, z8.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n" + "zip2 z13.d, z9.d, z9.d\n" + "zip1 z9.d, z9.d, z9.d\n" + "addvl x11, x11, #4\n" + "zip2 z14.d, z10.d, z10.d\n" + "zip1 z10.d, z10.d, z10.d\n" + "zip2 z15.d, z11.d, z11.d\n" + "zip1 z11.d, z11.d, z11.d\n" + "mov z16.d, z8.d\n" + "mov z20.d, z12.d\n" + "mov z17.d, z9.d\n" + "mov z21.d, z13.d\n" + "mov z18.d, z10.d\n" + "mov z22.d, z14.d\n" + "mov z19.d, z11.d\n" + "mov z23.d, z15.d\n" + "mov z24.d, z8.d\n" + "mov z28.d, z12.d\n" + "mov z25.d, z9.d\n" + "mov z29.d, z13.d\n" + "mov z26.d, z10.d\n" + "mov z30.d, z14.d\n" + "mov z27.d, z11.d\n" + "mov z31.d, z15.d\n" + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z8.d, z9.d, z12.d\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z11.d, 
z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z25.s }, p4/Z, [x21]\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "zip1 z24.d, z25.d, z28.d\n" + "zip2 z28.d, z25.d, z28.d\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 70f\n" + "69:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "70:" // Height 6: setup done + "mov x27, #0x0\n" + "71:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 73f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "73:" // Height 6: input setup done + "cmp x26, #0x4\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "sub x26, x26, #0x4\n" + "cmp x26, #0x4\n" + "uzp1 z3.h, z3.h, z3.h\n" + "uzp1 z4.h, 
z4.h, z4.h\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "uzp1 z5.h, z5.h, z5.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "add x23, x23, #0x10\n" + "trn1 z2.d, z2.d, z3.d\n" + "trn1 z4.d, z4.d, z5.d\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + "add x22, x22, #0x10\n" + ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + "add x20, x20, #0x10\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "whilelt p0.s, XZR, x26\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" + "uzp1 z0.h, z0.h, z0.h\n" + "ld1h { z6.h }, p5/Z, [x9]\n" + "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" + "uzp1 z1.h, z1.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "uzp1 z3.h, z3.h, z3.h\n" + "uzp1 z4.h, z4.h, z4.h\n" + "uzp1 z5.h, z5.h, z5.h\n" + "trn1 z0.d, z0.d, z1.d\n" + ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "trn1 z2.d, z2.d, z3.d\n" + "trn1 z4.d, z4.d, z5.d\n" + ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" + ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" + ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" + ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x6466e40a // bfmmla z10.s, z0.h, 
z6.h\n" + ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" + ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" + ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" + "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" + ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" + "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" + ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" + ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + "76:" // Height 6: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 71b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "uzp1 z6.d, z8.d, z12.d\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z8.d, z8.d, z12.d\n" + "uzp1 z12.d, z9.d, z13.d\n" + "uzp2 z9.d, z9.d, z13.d\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z23.d, z24.d, z28.d\n" + "uzp2 z24.d, z24.d, z28.d\n" + "uzp1 z28.d, z25.d, z29.d\n" + "uzp2 z25.d, z25.d, z29.d\n" + "uzp1 z29.d, z26.d, z30.d\n" + "uzp2 z26.d, z26.d, z30.d\n" + "uzp1 z30.d, z27.d, z31.d\n" + "uzp2 z27.d, z27.d, z31.d\n" + "tbz %x[flags], #1, 77f\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z6.s, p5/M, z6.s, z1.s\n" + "fmin z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z1.s\n" + "fmin z14.s, p5/M, z14.s, z1.s\n" + "fmin z8.s, p5/M, z8.s, z1.s\n" + "fmin z9.s, p5/M, z9.s, z1.s\n" + "fmin z10.s, p5/M, z10.s, z1.s\n" + "fmin z11.s, p5/M, z11.s, z1.s\n" + "fmin z15.s, p5/M, z15.s, z1.s\n" + "fmin z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z1.s\n" + "fmin z22.s, p5/M, z22.s, z1.s\n" + "fmin z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z1.s\n" + "fmin z18.s, p5/M, z18.s, z1.s\n" + "fmin z19.s, p5/M, z19.s, z1.s\n" + "fmin z23.s, p5/M, z23.s, z1.s\n" + "fmin z28.s, p5/M, z28.s, z1.s\n" + "fmin z29.s, p5/M, z29.s, z1.s\n" + "fmin z30.s, p5/M, z30.s, z1.s\n" + "fmin z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z1.s\n" + "fmin z26.s, p5/M, z26.s, z1.s\n" + "fmin z27.s, p5/M, z27.s, z1.s\n" + "fmax z6.s, p5/M, z6.s, z0.s\n" + "fmax z12.s, p5/M, z12.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z0.s\n" + "fmax z14.s, p5/M, z14.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z0.s\n" + "fmax z9.s, p5/M, z9.s, z0.s\n" + "fmax z10.s, p5/M, z10.s, z0.s\n" + "fmax z11.s, p5/M, z11.s, z0.s\n" + "fmax z15.s, p5/M, z15.s, z0.s\n" + "fmax z20.s, p5/M, z20.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z0.s\n" + "fmax z22.s, p5/M, z22.s, z0.s\n" + "fmax z16.s, p5/M, z16.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z0.s\n" + "fmax z18.s, p5/M, z18.s, z0.s\n" + "fmax z19.s, p5/M, z19.s, z0.s\n" + "fmax z23.s, p5/M, z23.s, z0.s\n" + "fmax z28.s, p5/M, z28.s, z0.s\n" + "fmax z29.s, p5/M, z29.s, z0.s\n" + "fmax z30.s, p5/M, z30.s, z0.s\n" + 
"fmax z24.s, p5/M, z24.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z0.s\n" + "fmax z26.s, p5/M, z26.s, z0.s\n" + "fmax z27.s, p5/M, z27.s, z0.s\n" + "77:" // Height 6: No activation + "st1w { z6.s }, p4, [x28]\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x24]\n" + "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z15.s }, p4, [x23]\n" + "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z23.s }, p4, [x21]\n" + "st1w { z28.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z29.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z30.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "78:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 67b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 80f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 79f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "79:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "80:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp index c278b3fc6b..c8a7d66f28 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp @@ -22,9 +22,10 @@ * IN THE SOFTWARE. 
*/ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -42,7 +43,8 @@ void sve_hybrid_s8qa_dot_4x4VL( ARGLIST ); class cls_sve_hybrid_s8qa_dot_4x4VL { public: - typedef int8_t operand_type; + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; typedef int8_t result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,7 +70,22 @@ public: return false; } - StdTransformsSVE transforms = {}; + StdTransformsSVE transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 29.89 }; + case CPUModel::A510: + return { 17.12 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_s8qa_dot_4x4VL; @@ -80,4 +97,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp index 8a7465ba6b..3031f5abf5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -158,7 +158,6 @@ void sve_hybrid_s8qa_dot_4x4VL ( "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" "cmp x24, #0x10\n" "bgt 7b\n" @@ -170,7 +169,6 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" @@ -212,9 +210,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "tbnz %x[flags], #31, 11f\n" "sdot z11.s, z0.b, z15.b\n" "11:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 4b\n" "tbnz %x[flags], #31, 12f\n" @@ -251,16 +248,16 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" "tbz %x[flags], #5, 13f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "and z7.d, z19.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "13:" // Height 1: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -396,9 +393,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "22:" // Height 2: Multiply loop: unique 3: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" "bgt 21b\n" "23:" // Height 2: Multiply loop: Single iteration only "ld1b { z4.b }, p2/Z, [x28]\n" "whilelt p0.b, XZR, x24\n" "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x22, x22, #0x10\n" - "sdot z20.s, z4.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[0]\n" "addvl x28, x28, #4\n" "sdot z21.s, z5.b, z1.b[0]\n" "sdot z18.s, z6.b, z0.b[0]\n" @@ -470,10 +463,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "25:" // Height 2: Multiply loop: unique 4: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 18b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -527,27 +518,27 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" "tbz %x[flags], #5, 27f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" "27:" // Height 2: no shift correction @@ -731,11 +722,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "36:" // Height 3: Multiply loop: unique 5: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" - "prfm pldl1keep, [x21, #0x80]\n" "bgt 35b\n" "37:" // Height 3: Multiply loop: Single iteration only "ld1b { z4.b }, p2/Z, [x28]\n" @@ -745,16 +733,13 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "sdot z20.s, z4.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x21, x21, #0x10\n" - "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" + "sdot z24.s, z4.b, z2.b[0]\n" "sdot z21.s, z5.b, z1.b[0]\n" "sdot z25.s, z5.b, z2.b[0]\n" "sdot z18.s, z6.b, z0.b[0]\n" @@ -825,11 +810,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "39:" // Height 3: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -899,39 +881,39 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" "tbz %x[flags], #5, 41f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, 
#0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" - "and z5.d, z24.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "and z6.d, z25.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" "and z7.d, z26.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" "and z8.d, z27.d, z0.d\n" - "sqadd z25.s, z25.s, z6.s\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" "sqadd z26.s, z26.s, z7.s\n" "sqadd z27.s, z27.s, z8.s\n" "41:" // Height 3: no shift correction @@ -1165,12 +1147,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" "50:" // Height 4: Multiply loop: unique 7: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "bgt 49b\n" "51:" // Height 4: Multiply loop: Single iteration only "ld1b { z4.b }, p2/Z, [x28]\n" @@ -1180,19 +1158,15 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "sdot z20.s, z4.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" - "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x20, x20, #0x10\n" "sdot z21.s, z5.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" + "sdot z24.s, z4.b, z2.b[0]\n" "sdot z28.s, z4.b, z3.b[0]\n" "sdot z25.s, z5.b, z2.b[0]\n" "sdot z29.s, z5.b, z3.b[0]\n" @@ -1279,12 +1253,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" "53:" // Height 4: Multiply loop: unique 8: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 46b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1370,52 +1340,52 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" "tbz %x[flags], #5, 55f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, 
z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" - "and z5.d, z24.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "and z6.d, z25.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" "and z7.d, z26.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" "and z8.d, z27.d, z0.d\n" - "and z9.d, z28.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z8.s, z8.s, #0x1f\n" "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "and z9.d, z28.d, z0.d\n" "and z10.d, z29.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" "and z4.d, z30.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" "asr z10.s, z10.s, #0x1f\n" - "sqadd z26.s, z26.s, z7.s\n" - "and z5.d, z31.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" - "sqadd z27.s, z27.s, z8.s\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z28.s, z28.s, z9.s\n" "sqadd z29.s, z29.s, z10.s\n" "sqadd z30.s, z30.s, z4.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z31.s, z31.s, z5.s\n" "55:" // Height 4: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -1529,4 +1499,4 @@ void sve_hybrid_s8qa_dot_4x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp new file mode 100644 index 0000000000..9681505e8c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8qa_mmla_4x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qa_mmla_4x4VL
+{
+public:
+    typedef int8_t lhs_operand_type;
+    typedef int8_t rhs_operand_type;
+    typedef int8_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<int32_t>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 8;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return false;
+    }
+
+    StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 8> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+
+        if (std::is_same<T, int8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 47.37 };
+                case CPUModel::A510:
+                    return { 20.88 };
+            }
+        }
+
+        return { 1.0 };
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_s8qa_mmla_4x4VL;
+    cls_sve_hybrid_s8qa_mmla_4x4VL(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
new file mode 100644
index 0000000000..04f80982e8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -0,0 +1,1418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_mmla_4x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:" // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "mov z11.s, #0x0\n"
+      "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov z15.b, #0x1\n"
+      "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x27, %x[col_bias]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov x26, %x[output_ptr]\n"
+      "2:" // Height 1: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x9\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "3:" // Height 1: setup done
+      "mov x25, #0x0\n"
+      "4:" // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w24, [x20, x25, LSL #0x2]\n"
+      "tbz %x[flags], #3, 5f\n"
+      "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x23, [x20, #0x0]\n"
+      "cbnz x25, 6f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x23, x23, x19\n"
+      "b 6f\n"
+      "5:" // Height 1: setup direct input
+      "mov x23, %x[input_ptr]\n"
+      "6:" // Height 1: input setup done
+      "cmp x24, #0x10\n"
+      "ble 9f\n"
+      "7:" // Height 1: Multiply loop: Main loop head
+      "ld1b { z5.b }, p2/Z, [x28]\n"
+      "whilelt p0.b, XZR, x24\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "add x23, x23, #0x10\n"
+      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+      ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "addvl x28, x28, #16\n"
+      ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+
".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 8f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "subs x24, x24, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + "addvl x28, x28, #8\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "10:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 11f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "uzp1 z16.d, z16.d, z20.d\n" + "uzp1 z17.d, z17.d, z21.d\n" + "uzp1 z18.d, z18.d, z22.d\n" + "uzp1 z19.d, z19.d, z23.d\n" + "mov z23.d, z16.d\n" + "tbnz %x[flags], #31, 12f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x22]\n" + "mov z11.s, z11.s[0]\n" + "neg z1.s, p2/M, z1.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + 
"12:" // Height 1: skip row sum fixup + "add z23.s, z23.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z23.s, z23.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 13f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "and z7.d, z19.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "13:" // Height 1: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + "add z23.s, z23.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z23.h, z23.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z17.b\n" + "st1b { z23.b }, p1, [x26]\n" + "addvl x26, x26, #1\n" + "14:" // Height 1: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 2b\n" + "b 58f\n" + "15:" // Height 2 + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z15.b, #0x1\n" + "mov x26, %x[output_ptr]\n" + "16:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "17:" // Height 2: setup done + "mov x25, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, 
x19\n" + "20:" // Height 2: input setup done + "cmp x24, #0x10\n" + "ble 23f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 22f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "22:" // Height 2: Multiply loop: unique 3: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 21b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45099835 // 
smmla z21.s, z1.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + "addvl x28, x28, #8\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "24:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 25f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "25:" // Height 2: Multiply loop: unique 4: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 18b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "mov z23.d, z7.d\n" + "tbnz %x[flags], #31, 26f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x22]\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z2.s, p2/M, z2.s\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "26:" // Height 2: skip row sum fixup + "add z23.s, z23.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z23.s, z23.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 27f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "27:" // Height 2: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 
0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z23.s, z23.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z23.h, z23.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "uzp1 z23.b, z23.b, z20.b\n" + "st1b { z23.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "28:" // Height 2: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 16b\n" + "b 58f\n" + "29:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "mov x26, %x[output_ptr]\n" + "mov z15.b, #0x1\n" + "30:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "31:" // Height 3: setup done + "mov x25, #0x0\n" + "32:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "34:" // Height 3: input setup done + "cmp x24, #0x10\n" + "ble 37f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "add 
x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 36f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "sdot z13.s, z3.b, z15.b\n" + "36:" // Height 3: Multiply loop: unique 5: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 35b\n" + "37:" // Height 3: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4506985c // smmla 
z28.s, z2.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "38:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 39f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "sdot z13.s, z3.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 6: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 32b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "add x20, x21, x19\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z24.d, z24.d, z28.d\n" + "uzp1 z25.d, z25.d, z29.d\n" + "uzp1 z26.d, z26.d, z30.d\n" + "uzp1 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x22]\n" + ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z3.s, p2/M, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "40:" // Height 3: skip row sum fixup + "add z31.s, z31.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, 
#3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 41f\n" + "and z4.d, z31.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" + "and z7.d, z26.d, z0.d\n" + "and z8.d, z27.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "41:" // Height 3: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z31.h, z31.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, 
p2/M, z16.s, z5.s\n" + "uzp1 z31.b, z31.b, z20.b\n" + "st1b { z31.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z26.s, z26.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x20]\n" + "42:" // Height 3: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 30b\n" + "b 58f\n" + "43:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" + "mov z14.s, #0x0\n" + "mov x19, #0x4\n" + "mov z15.b, #0x1\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "44:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "45:" // Height 4: setup done + "mov x25, #0x0\n" + "46:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 47f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 48f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "48:" // Height 4: input setup done + "cmp x24, #0x10\n" + "ble 51f\n" + "49:" // Height 4: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { 
z3.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x20]\n" + "add x21, x21, #0x10\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 50f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "sdot z13.s, z3.b, z15.b\n" + "50:" // Height 4: Multiply loop: unique 7: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 49b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x20]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, 
p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "52:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 53f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z11.s, z1.b, z15.b\n" + "sdot z13.s, z3.b, z15.b\n" + "53:" // Height 4: Multiply loop: unique 8: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 46b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "add x20, x21, x19\n" + "uzp1 z21.d, z18.d, z22.d\n" + "add x19, x20, x19\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z23.d, z24.d, z28.d\n" + "uzp2 z24.d, z24.d, z28.d\n" + "uzp1 z28.d, z25.d, z29.d\n" + "uzp2 z25.d, z25.d, z29.d\n" + "uzp1 z29.d, z26.d, z30.d\n" + "uzp2 z26.d, z26.d, z30.d\n" + "uzp1 z30.d, z27.d, z31.d\n" + "uzp2 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 54f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z4.s, p2/M, z4.s\n" + "mov z14.s, z13.s[3]\n" + "mov z13.s, z13.s[0]\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "54:" // Height 4: skip row sum fixup + "add z31.s, z31.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr 
%x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z23.s, z23.s, z13.s\n" + "add z28.s, z28.s, z13.s\n" + "add z29.s, z29.s, z13.s\n" + "add z30.s, z30.s, z13.s\n" + "add z24.s, z24.s, z14.s\n" + "add z25.s, z25.s, z14.s\n" + "add z26.s, z26.s, z14.s\n" + "add z27.s, z27.s, z14.s\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 55f\n" + "and z4.d, z31.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "and z5.d, z23.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "sqadd z23.s, z23.s, z5.s\n" + "and z6.d, z28.d, z0.d\n" + "and z7.d, z29.d, z0.d\n" + "and z8.d, z30.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z28.s, z28.s, z6.s\n" + "sqadd z29.s, z29.s, z7.s\n" + "sqadd z30.s, z30.s, z8.s\n" + "and z9.d, z24.d, z0.d\n" + "and z10.d, z25.d, z0.d\n" + "and z4.d, z26.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z24.s, z24.s, z9.s\n" + "sqadd z25.s, z25.s, z10.s\n" + "sqadd z26.s, z26.s, z4.s\n" + "and z5.d, z27.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z27.s, z27.s, 
z5.s\n" + "55:" // Height 4: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z31.h, z31.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "uzp1 z31.b, z31.b, z20.b\n" + "st1b { z31.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z29.s, z29.s, z4.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z30.s, z30.s, z4.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "uzp1 z23.h, z23.h, z28.h\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z28.h, z29.h, z30.h\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z23.b, z23.b, z28.b\n" + "st1b { z23.b }, p1, [x20]\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x19]\n" + "56:" // Height 4: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 44b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 58f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 57f\n" + "add x20, x20, #0x4\n" 
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "57:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "58:" // Exit + + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp index 57056b4c2a..dad04c81e8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp @@ -22,9 +22,10 @@ * IN THE SOFTWARE. */ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -42,7 +43,8 @@ void sve_hybrid_s8qs_dot_6x4VL( ARGLIST ); class cls_sve_hybrid_s8qs_dot_6x4VL { public: - typedef int8_t operand_type; + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; typedef int8_t result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,7 +70,22 @@ public: return false; } - StdTransformsSVE transforms = {}; + StdTransformsSVE transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 30.13 }; + case CPUModel::A510: + return { 19.77 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_s8qs_dot_6x4VL; @@ -80,4 +97,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp index 0328c107e2..6b08d2834b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -135,13 +135,12 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" "sdot z10.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" "sdot z11.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" @@ -176,7 +175,6 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" @@ -215,9 +213,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "9:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 4b\n" "ld1w { z0.s }, p2/Z, [x11]\n" @@ -259,16 +256,16 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" "tbz %x[flags], #5, 12f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" - "asr z5.s, z5.s, #0x1f\n" "and z7.d, z11.d, z3.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z9.s, z9.s, z5.s\n" "sqadd z10.s, z10.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" "12:" // Height 1: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" @@ -351,16 +348,14 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" @@ -411,9 +406,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z13.s, z7.b, z1.b[0]\n" @@ -468,10 +461,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "22:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 17b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -523,27 +514,27 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" "tbz %x[flags], #5, 25f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "and z7.d, z11.d, z3.d\n" "asr z6.s, z6.s, #0x1f\n" 
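+      // Shift-correction block (flags bit 5): ANDing an accumulator with its
+      // (negative) per-channel shift and taking asr #31 yields -1 exactly when
+      // a right shift is pending and the accumulator is negative; the sqadd
+      // below then nudges such values down by one so that the following srshl
+      // rounds ties away from zero.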
"sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" - "and z4.d, z12.d, z0.d\n" "sqadd z9.s, z9.s, z5.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z10.s, z10.s, z6.s\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z12.d, z0.d\n" "and z5.d, z13.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" - "and z6.d, z14.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z12.s, z12.s, z4.s\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z6.d, z14.d, z2.d\n" "and z7.d, z15.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z13.s, z13.s, z5.s\n" "sqadd z14.s, z14.s, z6.s\n" "sqadd z15.s, z15.s, z7.s\n" "25:" // Height 2: no shift correction @@ -654,21 +645,18 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "cmp x26, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" "sdot z17.s, z7.b, z2.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -733,12 +721,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" @@ -808,11 +793,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "35:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 30b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -873,37 +855,37 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" "tbz %x[flags], #5, 38f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "and z7.d, z11.d, z3.d\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" - "and z4.d, z12.d, z0.d\n" "sqadd z9.s, z9.s, z5.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z10.s, z10.s, z6.s\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z12.d, z0.d\n" "and z5.d, z13.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" - "and z6.d, z14.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z12.s, z12.s, z4.s\n" - "and z7.d, z15.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z13.s, z13.s, z5.s\n" + "and z6.d, z14.d, z2.d\n" + "and z7.d, z15.d, z3.d\n" "and z4.d, z16.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" "sqadd z14.s, z14.s, z6.s\n" - "and z5.d, z17.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z15.s, z15.s, 
z7.s\n" - "and z6.d, z18.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" "and z7.d, z19.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" @@ -1043,26 +1025,22 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -1141,19 +1119,15 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "add x22, x22, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" @@ -1234,12 +1208,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "48:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 43b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1309,52 +1279,52 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" "tbz %x[flags], #5, 51f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "and z7.d, z11.d, z3.d\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" - "and z4.d, z12.d, z0.d\n" "sqadd z9.s, z9.s, z5.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z10.s, z10.s, z6.s\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z12.d, z0.d\n" "and z5.d, z13.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" - "and z6.d, z14.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z12.s, z12.s, z4.s\n" - "and z7.d, z15.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z13.s, z13.s, z5.s\n" + "and z6.d, z14.d, z2.d\n" + "and z7.d, z15.d, z3.d\n" "and z4.d, z16.d, z0.d\n" + "asr z6.s, z6.s, 
#0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" "sqadd z14.s, z14.s, z6.s\n" - "and z5.d, z17.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z15.s, z15.s, z7.s\n" - "and z6.d, z18.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" "and z7.d, z19.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" - "and z4.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z18.s, z18.s, z6.s\n" - "and z5.d, z21.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" + "and z4.d, z20.d, z0.d\n" + "and z5.d, z21.d, z1.d\n" "and z6.d, z22.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z20.s, z20.s, z4.s\n" - "and z7.d, z23.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z21.s, z21.s, z5.s\n" "sqadd z22.s, z22.s, z6.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" "sqadd z23.s, z23.s, z7.s\n" "51:" // Height 4: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" @@ -1518,32 +1488,27 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" @@ -1635,22 +1600,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "add x21, x21, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" @@ -1746,13 +1706,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "61:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], 
%[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 56b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1831,63 +1786,63 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" "tbz %x[flags], #5, 64f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "and z7.d, z11.d, z3.d\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" - "and z4.d, z12.d, z0.d\n" "sqadd z9.s, z9.s, z5.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z10.s, z10.s, z6.s\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z12.d, z0.d\n" "and z5.d, z13.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" - "and z6.d, z14.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z12.s, z12.s, z4.s\n" - "and z7.d, z15.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z13.s, z13.s, z5.s\n" + "and z6.d, z14.d, z2.d\n" + "and z7.d, z15.d, z3.d\n" "and z4.d, z16.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" "sqadd z14.s, z14.s, z6.s\n" - "and z5.d, z17.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z15.s, z15.s, z7.s\n" - "and z6.d, z18.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" "and z7.d, z19.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" - "and z4.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z18.s, z18.s, z6.s\n" - "and z5.d, z21.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" + "and z4.d, z20.d, z0.d\n" + "and z5.d, z21.d, z1.d\n" "and z6.d, z22.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z20.s, z20.s, z4.s\n" - "and z7.d, z23.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z21.s, z21.s, z5.s\n" - "and z4.d, z24.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z22.s, z22.s, z6.s\n" + "and z7.d, z23.d, z3.d\n" + "and z4.d, z24.d, z0.d\n" "and z5.d, z25.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z23.s, z23.s, z7.s\n" - "and z6.d, z26.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z4.s\n" + "sqadd z25.s, z25.s, z5.s\n" + "and z6.d, z26.d, z2.d\n" "and z7.d, z27.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z25.s, z25.s, z5.s\n" "sqadd z26.s, z26.s, z6.s\n" "sqadd z27.s, z27.s, z7.s\n" "64:" // Height 5: no shift correction @@ -2082,37 +2037,31 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "add x20, x20, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "sdot z28.s, 
z6.b, z5.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "sdot z29.s, z7.b, z5.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" @@ -2218,25 +2167,19 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" - "add x20, x20, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "sdot z28.s, z6.b, z5.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "sdot z29.s, z7.b, z5.b[0]\n" @@ -2347,14 +2290,8 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" "74:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 69b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -2442,73 +2379,73 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" "tbz %x[flags], #5, 77f\n" "and z4.d, z8.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" "and z6.d, z10.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "and z7.d, z11.d, z3.d\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z8.s, z8.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" - "and z4.d, z12.d, z0.d\n" "sqadd z9.s, z9.s, z5.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z10.s, z10.s, z6.s\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z12.d, z0.d\n" "and z5.d, z13.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z11.s, z11.s, z7.s\n" - "and z6.d, z14.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z12.s, z12.s, z4.s\n" - "and z7.d, z15.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z13.s, z13.s, z5.s\n" + "and z6.d, z14.d, z2.d\n" + "and z7.d, z15.d, z3.d\n" "and z4.d, z16.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" "sqadd z14.s, z14.s, z6.s\n" - "and z5.d, z17.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z15.s, z15.s, z7.s\n" - "and z6.d, z18.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" "and z7.d, z19.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" - "and z4.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z18.s, z18.s, z6.s\n" - "and z5.d, z21.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, 
z19.s, z7.s\n" + "and z4.d, z20.d, z0.d\n" + "and z5.d, z21.d, z1.d\n" "and z6.d, z22.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z20.s, z20.s, z4.s\n" - "and z7.d, z23.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z21.s, z21.s, z5.s\n" - "and z4.d, z24.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z22.s, z22.s, z6.s\n" + "and z7.d, z23.d, z3.d\n" + "and z4.d, z24.d, z0.d\n" "and z5.d, z25.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" "sqadd z23.s, z23.s, z7.s\n" - "and z6.d, z26.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z4.s\n" - "and z7.d, z27.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z25.s, z25.s, z5.s\n" + "and z6.d, z26.d, z2.d\n" + "and z7.d, z27.d, z3.d\n" "and z4.d, z28.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" "sqadd z26.s, z26.s, z6.s\n" - "and z5.d, z29.d, z1.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z27.s, z27.s, z7.s\n" - "and z6.d, z30.d, z2.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z28.s, z28.s, z4.s\n" + "and z5.d, z29.d, z1.d\n" + "and z6.d, z30.d, z2.d\n" "and z7.d, z31.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z7.s, z7.s, #0x1f\n" "sqadd z29.s, z29.s, z5.s\n" "sqadd z30.s, z30.s, z6.s\n" @@ -2665,4 +2602,4 @@ void sve_hybrid_s8qs_dot_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp new file mode 100644 index 0000000000..2b7ad8bf4b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int8_t>, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_s8qs_mmla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_s8qs_mmla_6x4VL +{ +public: + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<int32_t>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsSVE transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 49.98 }; + case CPUModel::A510: + return { 22.62 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_s8qs_mmla_6x4VL; + cls_sve_hybrid_s8qs_mmla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp new file mode 100644 index 0000000000..6aba002706 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp @@ -0,0 +1,2431 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void sve_hybrid_s8qs_mmla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 66f\n" + "cmp %x[M], #0x4\n" + "bgt 53f\n" + "beq 40f\n" + "cmp %x[M], #0x2\n" + "bgt 27f\n" + "beq 14f\n" + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "2:" // Height 1: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "3:" // Height 1: setup done + "mov x27, #0x0\n" + "4:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 8f\n" + "7:" // Height 1: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b,
z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "bgt 7b\n" + "8:" // Height 1: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ble 9f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "9:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 4b\n" + "uzp1 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "uzp1 z9.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp1 z10.d, z10.d, z14.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "uzp1 z11.d, z11.d, z15.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "mov z15.d, z8.d\n" + "add z9.s, z9.s, z1.s\n" + "add z15.s, z15.s, z0.s\n" + 
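+ // How the MMLA scheme works throughout this kernel: trn1/trn2 on .d elements pack two A rows per Z register (for odd heights the second row's lanes are undefined and later discarded), each smmla accumulates a 2x2 tile of s32 sums from a 2x8 * 8x2 int8 product, and the uzp1/uzp2 sequence after the string loop de-interleaves the paired accumulators back into per-row vectors.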
"add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 10f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 11f\n" + "10:" // Height 1: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "11:" // Height 1: parameters loaded + ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + "tbz %x[flags], #5, 12f\n" + "and z4.d, z15.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z15.s, z15.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "sqadd z10.s, z10.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "12:" // Height 1: no shift correction + ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + "add z15.s, z15.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "uzp1 z15.h, z15.h, z9.h\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z15.b, z15.b, z9.b\n" + "st1b { z15.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "13:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "15:" // Height 2: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "16:" // Height 2: setup done + "mov x27, #0x0\n" + "17:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 18f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 19f\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 19f\n" + "18:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "19:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 21f\n" + "20:" // Height 2: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "bgt 20b\n" + "21:" // Height 2: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ble 22f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + 
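+ // K-tail handling: the main loop consumes 16 int8 values per row per iteration (two 8-deep smmla blocks); this tail path loads under a whilelt predicate, always issues the first 8-deep block, and the preceding subs/ble skips this second block when 8 or fewer values remained.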
"ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "22:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 17b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "add x23, x9, x19\n" + "uzp1 z12.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp2 z9.d, z9.d, z13.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "mov z15.d, z7.d\n" + "add z15.s, z15.s, z0.s\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 23f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 24f\n" + "23:" // Height 2: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "24:" // Height 2: parameters loaded + ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" + ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + "tbz %x[flags], #5, 25f\n" + "and z4.d, z15.d, z0.d\n" + "and z5.d, z12.d, z1.d\n" + "and z6.d, z13.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z15.s, z15.s, z4.s\n" + "sqadd z12.s, z12.s, z5.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "and z7.d, z14.d, z3.d\n" + "and z4.d, z8.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z14.s, z14.s, z7.s\n" + "sqadd z8.s, z8.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "25:" // Height 2: no shift correction + ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + "ld1rw { z5.s }, 
p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add z15.s, z15.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "uzp1 z15.h, z15.h, z12.h\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "uzp1 z15.b, z15.b, z12.b\n" + "st1b { z15.b }, p1, [x9]\n" + "add z9.s, z9.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x23]\n" + "26:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "28:" // Height 3: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "29:" // Height 3: setup done + "mov x27, #0x0\n" + "30:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "32:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "add x23, 
x23, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 
0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 35f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "35:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 30b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "add x23, x9, x19\n" + "uzp1 z12.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp2 z9.d, z9.d, z13.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" + "uzp1 z13.d, z10.d, z14.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "uzp2 z10.d, z10.d, z14.d\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z16.d, z16.d, z20.d\n" + "uzp1 z17.d, z17.d, z21.d\n" + "uzp1 z18.d, z18.d, z22.d\n" + "uzp1 z19.d, z19.d, z23.d\n" + "mov z23.d, z7.d\n" + "add z23.s, z23.s, z0.s\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 36f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 37f\n" + "36:" // Height 3: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "37:" // Height 3: parameters loaded + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" + ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a775ce // sqrdmulh 
z14.s, z14.s, z7.s\n" + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + "tbz %x[flags], #5, 38f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z12.d, z1.d\n" + "and z6.d, z13.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z12.s, z12.s, z5.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "and z7.d, z14.d, z3.d\n" + "and z4.d, z8.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z14.s, z14.s, z7.s\n" + "sqadd z8.s, z8.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z16.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" + "and z7.d, z19.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "38:" // Height 3: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add z23.s, z23.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "uzp1 z23.h, z23.h, z12.h\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "uzp1 z23.b, z23.b, z12.b\n" + "st1b { z23.b }, p1, [x9]\n" + "add z9.s, z9.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + 
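+ // Requantize/store order per accumulator: sqrdmulh by the multiplier, the optional and/asr/sqadd rounding fix (flag bit 5), srshl by the shift, add the c_offset (z4), clamp between minval (z5, smax) and maxval (z6, smin), then two uzp1 stages narrow s32 -> s16 -> s8 for the predicated st1b. Flag bit 4 selects per-channel parameters (four vectors of shifts from x12 and multipliers from x13) over the broadcast per-layer values.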
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x23]\n" + "add z18.s, z18.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x22]\n" + "39:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "41:" // Height 4: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "42:" // Height 4: setup done + "mov x27, #0x0\n" + "43:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 44f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 45f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 45f\n" + "44:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "45:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 47f\n" + "46:" // Height 4: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "add x23, x23, #0x10\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "add x22, x22, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b 
}, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 46b\n" + "47:" // Height 4: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 48f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + 
".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "48:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 43b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "add x23, x9, x19\n" + "uzp1 z12.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp2 z9.d, z9.d, z13.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" + "uzp1 z13.d, z10.d, z14.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "add x21, x22, x19\n" + "uzp2 z10.d, z10.d, z14.d\n" + "addvl x11, x11, #4\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "mov z23.d, z7.d\n" + "add z23.s, z23.s, z0.s\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 49f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 50f\n" + "49:" // Height 4: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "50:" // Height 4: parameters loaded + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" + ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" + ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + 
".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + "tbz %x[flags], #5, 51f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z12.d, z1.d\n" + "and z6.d, z13.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z12.s, z12.s, z5.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "and z7.d, z14.d, z3.d\n" + "and z4.d, z8.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z14.s, z14.s, z7.s\n" + "sqadd z8.s, z8.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z15.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "sqadd z15.s, z15.s, z4.s\n" + "and z5.d, z20.d, z1.d\n" + "and z6.d, z21.d, z2.d\n" + "and z7.d, z22.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "sqadd z22.s, z22.s, z7.s\n" + "and z4.d, z16.d, z0.d\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "51:" // Height 4: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add z23.s, z23.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "uzp1 z23.h, z23.h, z12.h\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "uzp1 z23.b, z23.b, z12.b\n" + "st1b { z23.b }, p1, [x9]\n" + "add z9.s, z9.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828855 // srshl 
z21.s, p2/M, z21.s, z2.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x23]\n" + "add z21.s, z21.s, z4.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z22.s, z22.s, z4.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "uzp1 z15.h, z15.h, z20.h\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "add z17.s, z17.s, z4.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "add z18.s, z18.s, z4.s\n" + "uzp1 z15.b, z15.b, z20.b\n" + "st1b { z15.b }, p1, [x22]\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "52:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "54:" // Height 5: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "55:" // Height 5: setup done + "mov x27, #0x0\n" + "56:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 57f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 58f\n" + "57:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "58:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 60f\n" + "59:" // Height 5: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, 
[x22]\n" + "cmp x26, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x25, x25, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "add x24, x24, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x23, x23, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "add x22, x22, #0x10\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 59b\n" + "60:" // Height 5: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + 
"ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 61f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "61:" // Height 5: Multiply loop: multiply skip + "ldr w19, 
[%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 56b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "add x23, x9, x19\n" + "uzp1 z12.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp2 z9.d, z9.d, z13.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" + "uzp1 z13.d, z10.d, z14.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "add x21, x22, x19\n" + "uzp2 z10.d, z10.d, z14.d\n" + "add x20, x21, x19\n" + "uzp1 z14.d, z11.d, z15.d\n" + "addvl x11, x11, #4\n" + "uzp2 z11.d, z11.d, z15.d\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z24.d, z24.d, z28.d\n" + "uzp1 z25.d, z25.d, z29.d\n" + "uzp1 z26.d, z26.d, z30.d\n" + "uzp1 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "add z31.s, z31.s, z0.s\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 62f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 63f\n" + "62:" // Height 5: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "63:" // Height 5: parameters loaded + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" + ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" + ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + "tbz %x[flags], #5, 64f\n" + 
"and z4.d, z31.d, z0.d\n" + "and z5.d, z12.d, z1.d\n" + "and z6.d, z13.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z12.s, z12.s, z5.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "and z7.d, z14.d, z3.d\n" + "and z4.d, z8.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z14.s, z14.s, z7.s\n" + "sqadd z8.s, z8.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z15.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "sqadd z15.s, z15.s, z4.s\n" + "and z5.d, z20.d, z1.d\n" + "and z6.d, z21.d, z2.d\n" + "and z7.d, z22.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "sqadd z22.s, z22.s, z7.s\n" + "and z4.d, z16.d, z0.d\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z7.d, z19.d, z3.d\n" + "and z4.d, z24.d, z0.d\n" + "and z5.d, z25.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z24.s, z24.s, z4.s\n" + "sqadd z25.s, z25.s, z5.s\n" + "and z6.d, z26.d, z2.d\n" + "and z7.d, z27.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd z27.s, z27.s, z7.s\n" + "64:" // Height 5: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "uzp1 z31.h, z31.h, z12.h\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "uzp1 z31.b, z31.b, z12.b\n" + "st1b { z31.b }, p1, [x9]\n" + "add z9.s, z9.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "smax z10.s, 
p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x23]\n" + "add z21.s, z21.s, z4.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z22.s, z22.s, z4.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "uzp1 z15.h, z15.h, z20.h\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "add z17.s, z17.s, z4.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "add z18.s, z18.s, z4.s\n" + "uzp1 z15.b, z15.b, z20.b\n" + "st1b { z15.b }, p1, [x22]\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z27.s, z27.s, z4.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x20]\n" + "65:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x20, #0x6\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "67:" // Height 6: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x10\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "68:" // Height 6: setup done + "mov x27, #0x0\n" + "69:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, 
x27, LSL #0x2]\n" + "tbz %x[flags], #3, 70f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 71f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 71f\n" + "70:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "71:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 73f\n" + "72:" // Height 6: Multiply loop: Main loop head + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x25, x25, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x24, x24, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "add x23, x23, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "add x20, x20, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" 
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 72b\n" + "73:" // Height 6: Multiply loop: Single iteration only + "ld1b { z7.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 74f\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b 
{ z6.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "74:" // Height 6: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 69b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "ld1w { z0.s }, p2/Z, [x11]\n" + "add x23, x9, x19\n" + "uzp1 z12.d, z9.d, z13.d\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "uzp2 z9.d, z9.d, z13.d\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" + "uzp1 z13.d, z10.d, z14.d\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "add x21, x22, x19\n" + "uzp2 z10.d, z10.d, z14.d\n" + "add x20, x21, x19\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x19, x20, x19\n" + "uzp2 z11.d, z11.d, z15.d\n" + "addvl x11, x11, #4\n" + "uzp1 z15.d, z16.d, z20.d\n" + "uzp2 z16.d, z16.d, z20.d\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z23.d, z24.d, z28.d\n" + "uzp2 z24.d, z24.d, z28.d\n" + "uzp1 z28.d, z25.d, z29.d\n" + "uzp2 z25.d, z25.d, z29.d\n" + "uzp1 z29.d, z26.d, z30.d\n" + "uzp2 z26.d, z26.d, z30.d\n" + "uzp1 z30.d, z27.d, z31.d\n" + "uzp2 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "add z31.s, z31.s, z0.s\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 75f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 76f\n" + 
"75:" // Height 6: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" + "mov z1.d, z0.d\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "76:" // Height 6: parameters loaded + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" + ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" + ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5779c // sqrdmulh z28.s, z28.s, z5.s\n" + ".inst 0x04a677bd // sqrdmulh z29.s, z29.s, z6.s\n" + ".inst 0x04a777de // sqrdmulh z30.s, z30.s, z7.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + "tbz %x[flags], #5, 77f\n" + "and z4.d, z31.d, z0.d\n" + "and z5.d, z12.d, z1.d\n" + "and z6.d, z13.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z12.s, z12.s, z5.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "and z7.d, z14.d, z3.d\n" + "and z4.d, z8.d, z0.d\n" + "and z5.d, z9.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z14.s, z14.s, z7.s\n" + "sqadd z8.s, z8.s, z4.s\n" + "sqadd z9.s, z9.s, z5.s\n" + "and z6.d, z10.d, z2.d\n" + "and z7.d, z11.d, z3.d\n" + "and z4.d, z15.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "sqadd z15.s, z15.s, z4.s\n" + "and z5.d, z20.d, z1.d\n" + "and z6.d, z21.d, z2.d\n" + "and z7.d, z22.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "sqadd z22.s, z22.s, z7.s\n" + "and z4.d, z16.d, z0.d\n" + "and z5.d, z17.d, z1.d\n" + "and z6.d, z18.d, z2.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z7.d, z19.d, z3.d\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z28.d, z1.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z28.s, z28.s, z5.s\n" + "and z6.d, z29.d, z2.d\n" + "and z7.d, z30.d, z3.d\n" + "and z4.d, z24.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z29.s, z29.s, z6.s\n" + "sqadd z30.s, z30.s, z7.s\n" + "sqadd z24.s, z24.s, z4.s\n" + "and z5.d, z25.d, z1.d\n" + "and z6.d, z26.d, 
z2.d\n" + "and z7.d, z27.d, z3.d\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z25.s, z25.s, z5.s\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd z27.s, z27.s, z7.s\n" + "77:" // Height 6: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" + ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + "add x24, %x[qp], %[minval]\n" + ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "uzp1 z31.h, z31.h, z12.h\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "uzp1 z31.b, z31.b, z12.b\n" + "st1b { z31.b }, p1, [x9]\n" + "add z9.s, z9.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x23]\n" + "add z21.s, z21.s, z4.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z22.s, z22.s, z4.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "uzp1 z15.h, z15.h, z20.h\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "add z17.s, z17.s, z4.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "add z18.s, z18.s, z4.s\n" + "uzp1 z15.b, z15.b, z20.b\n" + "st1b { z15.b }, p1, [x22]\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x4482883c // srshl z28.s, p2/M, z28.s, z1.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z23.s, p2/M, z23.s, 
z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482885d // srshl z29.s, p2/M, z29.s, z2.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z29.s, z29.s, z4.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482887e // srshl z30.s, p2/M, z30.s, z3.s\n" + "uzp1 z23.h, z23.h, z28.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z30.s, z30.s, z4.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "uzp1 z28.h, z29.h, z30.h\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "uzp1 z23.b, z23.b, z28.b\n" + "st1b { z23.b }, p1, [x20]\n" + "add z26.s, z26.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "add z27.s, z27.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x19]\n" + "78:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 67b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 80f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 79f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "79:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "80:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp index 37258978d3..b8ca7c5456 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -22,9 +22,10 @@
  * IN THE SOFTWARE.
  */
 #pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
 #include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
 #define ARGLIST \
     unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
 class cls_sve_hybrid_s8s32_dot_6x4VL
 {
 public:
-    typedef int8_t operand_type;
+    typedef int8_t lhs_operand_type;
+    typedef int8_t rhs_operand_type;
     typedef int32_t result_type;
     typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,36 @@ public:
         return true;
     }
-    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+    StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+
+        if (std::is_same<T, int32_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 31.56 };
+                case CPUModel::A510:
+                    return { 20.92 };
+                case CPUModel::V1:
+                    return { 62.24 };
+            }
+        }
+
+
+        if (std::is_same<T, int8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 31.60, 15.53, 0.62 };
+                case CPUModel::A510:
+                    return { 22.77, 3.90, 0.47 };
+                case CPUModel::V1:
+                    return { 62.97, 19.14, 0.92 };
+            }
+        }
+
+        return { 1.0 };
+    }
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
@@ -80,4 +111,5 @@ public:
 } // namespace arm_gemm
 #undef ARGLIST
+
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..e0fea96ef3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -0,0 +1,1033 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL_a64fx (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+    const int32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    __asm__ __volatile__(
+      "ptrue p4.b\n"
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 51f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 41f\n"
+      "beq 31f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 21f\n"
+      "beq 11f\n"
+      "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x28, %x[output_ptr]\n"
+      "2:" // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p3.s, x19, x10\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x10\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x10\n"
+      "incw x19\n"
+      "whilelt p0.s, x19, x10\n"
+      "tbz %x[flags], #0, 3f\n"
+      "ld1w { z8.s }, p3/Z, [x28]\n"
+      "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+      "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+      "b 4f\n"
+      "3:" // Height 1: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "4:" // Height 1: setup done
+      "mov x27, #0x0\n"
+      "5:" // Height 1: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w26, [x19, x27, LSL #0x2]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x25, [x20, #0x0]\n"
+      "cbnz x27, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x25, x25, x19\n"
+      "b 7f\n"
+      "6:" // Height 1: setup direct input
+      "mov x25, %x[input_ptr]\n"
+      "7:" // Height 1: input setup done
+      "subs x26, x26, #0x4\n"
+      "ld1rw { z0.s }, p4/Z, [x25]\n"
+      "ld1b { z6.b }, p4/Z, [x9]\n"
+      "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+      "ble 9f\n"
+      "8:" // Height 1: Multiply loop: Main loop
+      "sdot z8.s, z6.b, z0.b\n"
+      "sdot z9.s, z7.b, z0.b\n"
+      "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "add x25, x25, #0x4\n"
+      "sdot z10.s, z6.b, z0.b\n"
+      "sdot z11.s, z7.b, z0.b\n"
+      "subs x26, x26, #0x4\n"
+      "ld1rw { z0.s }, p4/Z, [x25]\n"
+      "ld1b { z6.b }, p4/Z, [x9]\n"
+      "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+      "bgt 8b\n"
+      "9:" // Height 1: Multiply loop: Main loop skip
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "sdot z8.s, z6.b, 
z0.b\n" + "sdot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z11.s, z7.b, z0.b\n" + "addvl x9, x9, #4\n" + "bne 5b\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "10:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 62f\n" + "11:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "12:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 13f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "b 14f\n" + "13:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "14:" // Height 2: setup done + "mov x27, #0x0\n" + "15:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 17f\n" + "16:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "17:" // Height 2: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 19f\n" + "18:" // Height 2: Multiply loop: Main loop + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x4\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "subs x26, x26, #0x4\n" + "add x24, x24, #0x4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 18b\n" + "19:" // Height 2: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "addvl x9, x9, #4\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, 
z1.b\n" + "bne 15b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "20:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 12b\n" + "b 62f\n" + "21:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "22:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "b 24f\n" + "23:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "24:" // Height 3: setup done + "mov x27, #0x0\n" + "25:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 26f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 27f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 27f\n" + "26:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "27:" // Height 3: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 29f\n" + "28:" // Height 3: Multiply loop: Main loop + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x23, x23, #0x4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z11.s, z7.b, z0.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "ld1rw { z1.s }, 
p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 28b\n" + "29:" // Height 3: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "bne 25b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "30:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 22b\n" + "b 62f\n" + "31:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "32:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 33f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "34:" // Height 4: setup done + "mov x27, #0x0\n" + "35:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz 
x27, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "37:" // Height 4: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 39f\n" + "38:" // Height 4: Multiply loop: Main loop + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "add x23, x23, #0x4\n" + "add x22, x22, #0x4\n" + "sdot z17.s, z7.b, z2.b\n" + "sdot z21.s, z7.b, z3.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "sdot z19.s, z7.b, z2.b\n" + "sdot z23.s, z7.b, z3.b\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 38b\n" + "39:" // Height 4: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "sdot z21.s, z7.b, z3.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "sdot z23.s, z7.b, z3.b\n" + "bne 35b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "40:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 32b\n" + "b 62f\n" + "41:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "42:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + 
"whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" + "b 44f\n" + "43:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "44:" // Height 5: setup done + "mov x27, #0x0\n" + "45:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 47f\n" + "46:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "47:" // Height 5: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 49f\n" + "48:" // Height 5: Multiply loop: Main loop + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "sdot z24.s, z6.b, z4.b\n" + "sdot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "add x21, x21, #0x4\n" + "sdot z21.s, z7.b, z3.b\n" + "sdot z25.s, z7.b, z4.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "sdot z26.s, z6.b, z4.b\n" + 
"sdot z11.s, z7.b, z0.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "sdot z23.s, z7.b, z3.b\n" + "sdot z27.s, z7.b, z4.b\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 48b\n" + "49:" // Height 5: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "cmp x27, x19\n" + "sdot z24.s, z6.b, z4.b\n" + "sdot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "sdot z21.s, z7.b, z3.b\n" + "sdot z25.s, z7.b, z4.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "sdot z26.s, z6.b, z4.b\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "sdot z23.s, z7.b, z3.b\n" + "sdot z27.s, z7.b, z4.b\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" + "50:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 42b\n" + "b 62f\n" + "51:" // Height 6 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "52:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 53f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + 
"ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" + "ld1w { z28.s }, p3/Z, [x19]\n" + "ld1w { z29.s }, p2/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x19, #2, MUL VL]\n" + "ld1w { z31.s }, p0/Z, [x19, #3, MUL VL]\n" + "b 54f\n" + "53:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "54:" // Height 6: setup done + "mov x27, #0x0\n" + "55:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 56f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 57f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 57f\n" + "56:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "57:" // Height 6: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 59f\n" + "58:" // Height 6: Multiply loop: Main loop + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "sdot z24.s, z6.b, z4.b\n" + "sdot z28.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "add x21, x21, #0x4\n" + "add x20, x20, #0x4\n" + "sdot z17.s, z7.b, z2.b\n" + "sdot z21.s, z7.b, z3.b\n" + "sdot z25.s, z7.b, z4.b\n" + "sdot z29.s, z7.b, z5.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "sdot z26.s, z6.b, z4.b\n" + "sdot z30.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "sdot z19.s, z7.b, z2.b\n" + "sdot z23.s, z7.b, z3.b\n" + "ld1rw { 
z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "sdot z27.s, z7.b, z4.b\n" + "sdot z31.s, z7.b, z5.b\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 58b\n" + "59:" // Height 6: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "sdot z8.s, z6.b, z0.b\n" + "sdot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "sdot z16.s, z6.b, z2.b\n" + "sdot z20.s, z6.b, z3.b\n" + "cmp x27, x19\n" + "sdot z24.s, z6.b, z4.b\n" + "sdot z28.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b\n" + "sdot z13.s, z7.b, z1.b\n" + "sdot z17.s, z7.b, z2.b\n" + "sdot z21.s, z7.b, z3.b\n" + "sdot z25.s, z7.b, z4.b\n" + "sdot z29.s, z7.b, z5.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z10.s, z6.b, z0.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z18.s, z6.b, z2.b\n" + "sdot z22.s, z6.b, z3.b\n" + "sdot z26.s, z6.b, z4.b\n" + "sdot z30.s, z6.b, z5.b\n" + "sdot z11.s, z7.b, z0.b\n" + "sdot z15.s, z7.b, z1.b\n" + "sdot z19.s, z7.b, z2.b\n" + "sdot z23.s, z7.b, z3.b\n" + "sdot z27.s, z7.b, z4.b\n" + "sdot z31.s, z7.b, z5.b\n" + "bne 55b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" + "st1w { z28.s }, p3, [x19]\n" + "st1w { z29.s }, p2, [x19, #1, MUL VL]\n" + "st1w { z30.s }, p1, [x19, #2, MUL VL]\n" + "st1w { z31.s }, p0, [x19, #3, MUL VL]\n" + "60:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 52b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", 
"x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp index 9cddee941e..dc5b7a33f4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp @@ -137,13 +137,12 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" "sdot z10.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" "sdot z11.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" @@ -178,7 +177,6 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" @@ -217,9 +215,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "10:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 5b\n" "st1w { z8.s }, p4, [x28]\n" @@ -296,16 +293,14 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" @@ -356,9 +351,7 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z13.s, z7.b, z1.b[0]\n" @@ -413,10 +406,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "21:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 16b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -511,21 +502,18 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, 
z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "cmp x26, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" "sdot z17.s, z7.b, z2.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -590,12 +578,9 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" @@ -665,11 +650,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "32:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 27b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -781,26 +763,22 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -879,19 +857,15 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "add x22, x22, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" @@ -972,12 +946,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "43:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 
38b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1106,32 +1076,27 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" @@ -1223,22 +1188,17 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "add x21, x21, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" @@ -1334,13 +1294,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "54:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 49b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1489,37 +1444,31 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" "add x20, x20, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "sdot z28.s, z6.b, 
z5.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "sdot z29.s, z7.b, z5.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" @@ -1625,25 +1574,19 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" - "add x20, x20, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "sdot z28.s, z6.b, z5.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "sdot z29.s, z7.b, z5.b[0]\n" @@ -1754,14 +1697,8 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" "65:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 60b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
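Note (annotation, not part of the patch): the generic.cpp hunks above consistently delete the "prfm pldl1keep, [xN, #0x80]" software prefetches from the multiply loops and reschedule the pointer increments around the sdot chains, presumably because the hardware prefetcher already covers these linear input streams and the explicit hints only cost issue slots. A scalar sketch of the removed idiom, assuming GCC/Clang's __builtin_prefetch and hypothetical names (a_row, K) for illustration only:

    #include <cstdint>

    static int32_t consume_row(const int8_t *a_row, int K) {
        int32_t acc = 0;
        for (int k = 0; k < K; k += 16) {
            // Hint the line 0x80 bytes ahead, for read, keep in cache: this is
            // the kind of explicit prefetch the patch removes from the asm loops.
            __builtin_prefetch(a_row + k + 0x80, 0, 3);
            for (int j = 0; j < 16 && k + j < K; j++)
                acc += a_row[k + j]; // stand-in for the sdot accumulation
        }
        return acc;
    }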
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp new file mode 100644 index 0000000000..b88ef14f25 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int32_t>, \ + const int32_t *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_s8s32_mmla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_s8s32_mmla_6x4VL +{ +public: + typedef int8_t lhs_operand_type; + typedef int8_t rhs_operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<int32_t>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 8> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 54.42 }; + case CPUModel::A510: + return { 24.21 }; + case CPUModel::V1: + return { 104.92 }; + } + } + + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 54.99, 15.37, 0.62 }; + case CPUModel::A510: + return { 23.87, 3.89, 0.37 }; + case CPUModel::V1: + return { 107.63, 19.24, 0.92 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_s8s32_mmla_6x4VL; + cls_sve_hybrid_s8s32_mmla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp new file mode 100644 index 0000000000..c3abb203ca --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp @@ -0,0 +1,1675 @@
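Note (annotation, not part of the patch): in the MMLA kernel that follows, each SMMLA instruction accumulates a 2x2 block of int32 dot products from a 2x8 int8 row pair and two 8-byte B columns. That is why the code trn1/trn2-interleaves adjacent A rows on load, keeps accumulators as zip1/zip2-interleaved pairs when re-reading C, and uzp1/uzp2-de-interleaves them at writeback (for odd heights the partner row is a don't-care register and uzp1 simply discards it). A scalar model of one 128-bit SMMLA granule, as an illustration only:

    #include <cstdint>

    // acc is the 2x2 int32 tile one SMMLA updates; a holds two rows of 8 bytes
    // (one trn1/trn2-interleaved A row pair); b holds two B columns, each laid
    // out as 8 consecutive bytes by the pretransposed B panel.
    static void smmla_granule(int32_t acc[2][2],
                              const int8_t a[2][8], const int8_t b[2][8]) {
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 8; k++)
                    acc[i][j] += static_cast<int32_t>(a[i][k]) *
                                 static_cast<int32_t>(b[j][k]);
    }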
+/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_s8s32_mmla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg, + const int32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 56f\n" + "cmp %x[M], #0x4\n" + "bgt 45f\n" + "beq 34f\n" + "cmp %x[M], #0x2\n" + "bgt 23f\n" + "beq 12f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 3f\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 4f\n" + "3:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "4:" // Height 1: setup done + "mov x27, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, 
z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ble 10f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "10:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 5b\n" + "uzp1 z8.d, z8.d, z12.d\n" + "st1w { z8.s }, p4, [x28]\n" + "uzp1 z9.d, z9.d, z13.d\n" + "uzp1 z10.d, z10.d, z14.d\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z11.d, z11.d, z15.d\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" 
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "11:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 68f\n" + "12:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "13:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 14f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 15f\n" + "14:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "15:" // Height 2: setup done + "mov x27, #0x0\n" + "16:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "18:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 20f\n" + "19:" // Height 2: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, 
z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ble 21f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "21:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 16b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x23]\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "22:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 13b\n" + "b 68f\n" + "23:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "24:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], 
#0, 25f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 26f\n" + "25:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "26:" // Height 3: setup done + "mov x27, #0x0\n" + "27:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 28f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 29f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 29f\n" + "28:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "29:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 31f\n" + "30:" // Height 3: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "add x23, x23, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // 
smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 30b\n" + "31:" // Height 3: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 32f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // 
smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "32:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 27b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z16.d, z16.d, z20.d\n" + "addvl x28, x28, #4\n" + "uzp1 z17.d, z17.d, z21.d\n" + "st1w { z8.s }, p4, [x23]\n" + "uzp1 z18.d, z18.d, z22.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp1 z19.d, z19.d, z23.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "33:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 24b\n" + "b 68f\n" + "34:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "35:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, 
z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 37f\n" + "36:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "37:" // Height 4: setup done + "mov x27, #0x0\n" + "38:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 39f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 40f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 40f\n" + "39:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "40:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 42f\n" + "41:" // Height 4: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "add x23, x23, #0x10\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "add x22, x22, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 
0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 41b\n" + "42:" // Height 4: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 43f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "43:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 38b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, 
z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "addvl x28, x28, #4\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "44:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 35b\n" + "b 68f\n" + "45:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "46:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z25.s }, p4/Z, [x20]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "zip1 z24.d, z25.d, z28.d\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 48f\n" + "47:" // Height 5: no accumulate + "mov 
z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "48:" // Height 5: setup done + "mov x27, #0x0\n" + "49:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 51f\n" + "50:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "51:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 53f\n" + "52:" // Height 5: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "cmp x26, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x25, x25, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "add x24, x24, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x23, x23, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "add x22, x22, #0x10\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, 
z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 52b\n" + "53:" // Height 5: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + 
".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 54f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "54:" // Height 5: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 49b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "addvl x28, x28, #4\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "uzp1 z24.d, z24.d, z28.d\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "uzp1 z25.d, z25.d, z29.d\n" + "st1w { z16.s }, p4, [x21]\n" + "uzp1 z26.d, z26.d, z30.d\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "uzp1 z27.d, z27.d, z31.d\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "55:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, 
XZR\n" + "bgt 46b\n" + "b 68f\n" + "56:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "57:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z25.s }, p4/Z, [x20]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z19.d, z24.d, z23.d\n" + "ld1w { z28.s }, p4/Z, [x19]\n" + "zip2 z23.d, z24.d, z23.d\n" + "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n" + "zip1 z24.d, z25.d, z28.d\n" + "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 59f\n" + "58:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "59:" // Height 6: setup done + "mov x27, #0x0\n" + "60:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, 
#0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 62f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 62f\n" + "61:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "62:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 64f\n" + "63:" // Height 6: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x25, x25, #0x10\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x24, x24, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "add x23, x23, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "add x20, x20, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, 
z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 63b\n" + "64:" // Height 6: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 65f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x4506982d // 
smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "65:" // Height 6: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 60b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "add x19, x20, x19, LSL #2\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "addvl x28, x28, #4\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "uzp1 z23.d, z24.d, z28.d\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "uzp2 z24.d, z24.d, z28.d\n" + "st1w { z16.s }, p4, [x21]\n" + "uzp1 z28.d, z25.d, z29.d\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "uzp2 z25.d, z25.d, z29.d\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "uzp1 z29.d, z26.d, z30.d\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "uzp2 z26.d, z26.d, z30.d\n" + "st1w { z23.s }, p4, [x20]\n" + "uzp1 z30.d, z27.d, z31.d\n" + "st1w { z28.s }, p3, [x20, #1, MUL VL]\n" + "uzp2 z27.d, z27.d, z31.d\n" + "st1w { z29.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z30.s }, p1, [x20, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x19]\n" + "st1w { z25.s }, p3, [x19, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x19, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x19, #3, MUL VL]\n" + "66:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 57b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 68f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 67f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "67:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "68:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" 
(&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index 3de8d178cd..c66ebedc4d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -22,9 +22,10 @@
  * IN THE SOFTWARE.
  */
 #pragma once
 
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
 #include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
 
 #define ARGLIST \
     unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
 class cls_sve_hybrid_u8qa_dot_4x4VL
 {
 public:
-    typedef uint8_t operand_type;
+    typedef uint8_t lhs_operand_type;
+    typedef uint8_t rhs_operand_type;
     typedef uint8_t result_type;
 
     typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,22 @@ public:
         return false;
     }
 
-    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+    StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+
+        if (std::is_same<T, uint8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 29.89 };
+                case CPUModel::A510:
+                    return { 17.12 };
+            }
+        }
+
+        return { 1.0 };
+    }
 
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
@@ -80,4 +97,5 @@ public:
 } // namespace arm_gemm
 
 #undef ARGLIST
+
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 0bfc28776f..be6d5b901d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -158,7 +158,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
       "tbnz %x[flags], #31, 8f\n"
       "udot z11.s, z0.b, z15.b\n"
       "8:" // Height 1: Multiply loop: unique 1: skip row sum
-      "prfm pldl1keep, [x23, #0x80]\n"
       "sub x24, x24, #0x10\n"
       "cmp x24, #0x10\n"
       "bgt 7b\n"
@@ -170,7 +169,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
       "ld1rqb { z0.b }, p0/Z, [x23]\n"
       "udot z16.s, z4.b, z0.b[0]\n"
       "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "add x23, x23, #0x10\n"
       "udot z17.s, z5.b, z0.b[0]\n"
       "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
       "addvl x28, x28, #4\n"
@@ -212,9 +210,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
       "tbnz %x[flags], #31, 11f\n"
       "udot z11.s, z0.b, z15.b\n"
       "11:" // Height 1: Multiply loop: unique 2: skip row sum
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "add x25, x25, #0x1\n"
       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x25, x25, #0x1\n"
       "cmp x25, x19\n"
       "bne 4b\n"
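(Aside: the u8qa kernels also keep per-row sums of the quantized LHS while multiplying: z15.b is preset to 1, instructions such as "udot z11.s, z0.b, z15.b" add each row's bytes into dedicated accumulators, and flag bit 31 gates the work at the "skip row sum" labels seen in the hunks above. A scalar sketch of the dot-with-ones idiom follows; that the sums feed the Requantize32 offset correction is inferred from context rather than spelled out in this hunk.)

    #include <cstdint>

    // Dot-with-ones row sum: each UDOT lane against an all-ones vector adds four
    // consecutive uint8 inputs into an int32 accumulator, so the row sums needed
    // for the quantization offset correction fall out of the same pass over A.
    static int32_t row_sum_ref(const uint8_t *row, unsigned int len)
    {
        int32_t sum = 0;
        for (unsigned int i = 0; i < len; i++) {
            sum += row[i];
        }
        return sum;
    }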
"tbnz %x[flags], #31, 12f\n" @@ -251,16 +248,16 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" "tbz %x[flags], #5, 13f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "and z7.d, z19.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" "sqadd z16.s, z16.s, z4.s\n" - "asr z7.s, z7.s, #0x1f\n" "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "13:" // Height 1: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -396,9 +393,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "22:" // Height 2: Multiply loop: unique 3: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" "bgt 21b\n" "23:" // Height 2: Multiply loop: Single iteration only @@ -409,12 +404,10 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x22, x22, #0x10\n" - "udot z20.s, z4.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z4.b, z1.b[0]\n" "addvl x28, x28, #4\n" "udot z21.s, z5.b, z1.b[0]\n" "udot z18.s, z6.b, z0.b[0]\n" @@ -470,10 +463,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "25:" // Height 2: Multiply loop: unique 4: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 18b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -527,27 +518,27 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" "tbz %x[flags], #5, 27f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" "27:" // Height 2: no shift correction @@ -731,11 +722,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "36:" // Height 3: Multiply loop: unique 5: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" - "prfm pldl1keep, [x21, #0x80]\n" "bgt 35b\n" "37:" // Height 3: Multiply loop: Single iteration only "ld1b { z4.b }, p2/Z, [x28]\n" @@ -745,16 +733,13 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" "ld1rqb 
{ z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "udot z20.s, z4.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x21, x21, #0x10\n" - "udot z24.s, z4.b, z2.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" + "udot z24.s, z4.b, z2.b[0]\n" "udot z21.s, z5.b, z1.b[0]\n" "udot z25.s, z5.b, z2.b[0]\n" "udot z18.s, z6.b, z0.b[0]\n" @@ -825,11 +810,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "39:" // Height 3: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -899,39 +881,39 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" "tbz %x[flags], #5, 41f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" - "and z5.d, z24.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "and z6.d, z25.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" "and z7.d, z26.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" "and z8.d, z27.d, z0.d\n" - "sqadd z25.s, z25.s, z6.s\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" "sqadd z26.s, z26.s, z7.s\n" "sqadd z27.s, z27.s, z8.s\n" "41:" // Height 3: no shift correction @@ -1165,12 +1147,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" "50:" // Height 4: Multiply loop: unique 7: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" "sub x24, x24, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" "cmp x24, #0x10\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "bgt 49b\n" "51:" // Height 4: Multiply loop: Single iteration only "ld1b { z4.b }, p2/Z, [x28]\n" @@ -1180,19 +1158,15 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "udot z20.s, z4.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" - "udot z24.s, z4.b, z2.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "add x20, x20, #0x10\n" "udot z21.s, z5.b, z1.b[0]\n" "ld1b { z7.b }, 
p2/Z, [x28, #3, MUL VL]\n" "addvl x28, x28, #4\n" + "udot z24.s, z4.b, z2.b[0]\n" "udot z28.s, z4.b, z3.b[0]\n" "udot z25.s, z5.b, z2.b[0]\n" "udot z29.s, z5.b, z3.b[0]\n" @@ -1279,12 +1253,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" "53:" // Height 4: Multiply loop: unique 8: skip row sum - "prfm pldl1keep, [x23, #0x80]\n" - "add x25, x25, #0x1\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" "cmp x25, x19\n" "bne 46b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1370,52 +1340,52 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" "tbz %x[flags], #5, 55f\n" "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" "and z6.d, z18.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" "and z7.d, z19.d, z0.d\n" "and z8.d, z20.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "and z9.d, z21.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z10.d, z22.d, z0.d\n" "asr z8.s, z8.s, #0x1f\n" - "and z4.d, z23.d, z0.d\n" "asr z9.s, z9.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "asr z10.s, z10.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "asr z4.s, z4.s, #0x1f\n" - "and z5.d, z24.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z19.s, z19.s, z7.s\n" "sqadd z20.s, z20.s, z8.s\n" "sqadd z21.s, z21.s, z9.s\n" + "and z10.d, z22.d, z0.d\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "and z6.d, z25.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" "and z7.d, z26.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" "and z8.d, z27.d, z0.d\n" - "and z9.d, z28.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" "asr z8.s, z8.s, #0x1f\n" "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "and z9.d, z28.d, z0.d\n" "and z10.d, z29.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" "and z4.d, z30.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" "asr z10.s, z10.s, #0x1f\n" - "sqadd z26.s, z26.s, z7.s\n" - "and z5.d, z31.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" - "sqadd z27.s, z27.s, z8.s\n" - "asr z5.s, z5.s, #0x1f\n" "sqadd z28.s, z28.s, z9.s\n" "sqadd z29.s, z29.s, z10.s\n" "sqadd z30.s, z30.s, z4.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" "sqadd z31.s, z31.s, z5.s\n" "55:" // Height 4: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -1529,4 +1499,4 @@ void sve_hybrid_u8qa_dot_4x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp new file mode 100644 index 0000000000..da27554a0f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_u8qa_mmla_4x4VL( ARGLIST ); + +class cls_sve_hybrid_u8qa_mmla_4x4VL +{ +public: + typedef uint8_t lhs_operand_type; + typedef uint8_t rhs_operand_type; + typedef uint8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return get_vector_length() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsSVE transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 47.30 }; + case CPUModel::A510: + return { 20.91 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_u8qa_mmla_4x4VL; + cls_sve_hybrid_u8qa_mmla_4x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp new file mode 100644 index 0000000000..0f3f5e35e1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp @@ -0,0 +1,1418 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void sve_hybrid_u8qa_mmla_4x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov z15.b, #0x1\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, 
#0x0]\n" + "cbnz x25, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "ble 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x23, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 8f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "subs x24, x24, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z10.b }, p2/Z, [x28, 
#4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + "addvl x28, x28, #8\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "10:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 11f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "uzp1 z16.d, z16.d, z20.d\n" + "uzp1 z17.d, z17.d, z21.d\n" + "uzp1 z18.d, z18.d, z22.d\n" + "uzp1 z19.d, z19.d, z23.d\n" + "mov z23.d, z16.d\n" + "tbnz %x[flags], #31, 12f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x22]\n" + "mov z11.s, z11.s[0]\n" + "neg z1.s, p2/M, z1.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + "12:" // Height 1: skip row sum fixup + "add z23.s, z23.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z23.s, z23.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 13f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "and z7.d, z19.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "13:" // Height 1: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + "add z23.s, z23.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z23.h, z23.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z17.b\n" + "st1b { z23.b }, p1, [x26]\n" + "addvl x26, x26, #1\n" + "14:" // Height 1: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 2b\n" + "b 58f\n" + "15:" // Height 2 + "mov 
z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z15.b, #0x1\n" + "mov x26, %x[output_ptr]\n" + "16:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "17:" // Height 2: setup done + "mov x25, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "20:" // Height 2: input setup done + "cmp x24, #0x10\n" + "ble 23f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 22f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "22:" // Height 2: Multiply loop: unique 3: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 21b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, 
MUL VL]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + "addvl x28, x28, #8\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "24:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 25f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "25:" // Height 2: Multiply loop: unique 4: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 18b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "mov z23.d, z7.d\n" + "tbnz %x[flags], #31, 26f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x22]\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z2.s, p2/M, z2.s\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "26:" // Height 2: skip row sum fixup + "add z23.s, z23.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z23.s, z23.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, 
z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 27f\n" + "and z4.d, z23.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z23.s, z23.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "27:" // Height 2: no shift correction + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z23.s, z23.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z23.h, z23.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "uzp1 z23.b, z23.b, z20.b\n" + "st1b { z23.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "28:" // Height 2: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 16b\n" + "b 58f\n" + "29:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "mov x26, %x[output_ptr]\n" + "mov z15.b, #0x1\n" + "30:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov 
z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "31:" // Height 3: setup done + "mov x25, #0x0\n" + "32:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "34:" // Height 3: input setup done + "cmp x24, #0x10\n" + "ble 37f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "tbnz 
%x[flags], #31, 36f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "udot z13.s, z3.b, z15.b\n" + "36:" // Height 3: Multiply loop: unique 5: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 35b\n" + "37:" // Height 3: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "38:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 39f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "udot z13.s, z3.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 6: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 32b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "add x20, x21, 
x19\n" + "uzp1 z21.d, z18.d, z22.d\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z24.d, z24.d, z28.d\n" + "uzp1 z25.d, z25.d, z29.d\n" + "uzp1 z26.d, z26.d, z30.d\n" + "uzp1 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x22]\n" + ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z3.s, p2/M, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "40:" // Height 3: skip row sum fixup + "add z31.s, z31.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 41f\n" + "and z4.d, z31.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "and z5.d, z24.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z6.d, z25.d, z0.d\n" + "and z7.d, z26.d, z0.d\n" + "and z8.d, z27.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, 
z27.s, z8.s\n" + "41:" // Height 3: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z31.h, z31.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "uzp1 z31.b, z31.b, z20.b\n" + "st1b { z31.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z26.s, z26.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x20]\n" + "42:" // Height 3: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 30b\n" + "b 58f\n" + "43:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" + "mov z14.s, #0x0\n" + "mov x19, #0x4\n" + "mov z15.b, #0x1\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "44:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x9\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + 
"45:" // Height 4: setup done + "mov x25, #0x0\n" + "46:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 47f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x25, 48f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "48:" // Height 4: input setup done + "cmp x24, #0x10\n" + "ble 51f\n" + "49:" // Height 4: Multiply loop: Main loop head + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x20]\n" + "add x21, x21, #0x10\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" + ".inst 
0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 50f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "udot z13.s, z3.b, z15.b\n" + "50:" // Height 4: Multiply loop: unique 7: skip row sum + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 49b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z5.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x21]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x20]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #8\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "52:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 53f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z11.s, z1.b, z15.b\n" + "udot z13.s, z3.b, z15.b\n" + "53:" // Height 4: Multiply loop: unique 8: skip row sum + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 46b\n" + "uzp1 z7.d, z16.d, z20.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + 
"uzp2 z16.d, z16.d, z20.d\n" + "add x21, x26, x19\n" + "uzp1 z20.d, z17.d, z21.d\n" + "uzp2 z17.d, z17.d, z21.d\n" + "add x20, x21, x19\n" + "uzp1 z21.d, z18.d, z22.d\n" + "add x19, x20, x19\n" + "uzp2 z18.d, z18.d, z22.d\n" + "uzp1 z22.d, z19.d, z23.d\n" + "uzp2 z19.d, z19.d, z23.d\n" + "uzp1 z23.d, z24.d, z28.d\n" + "uzp2 z24.d, z24.d, z28.d\n" + "uzp1 z28.d, z25.d, z29.d\n" + "uzp2 z25.d, z25.d, z29.d\n" + "uzp1 z29.d, z26.d, z30.d\n" + "uzp2 z26.d, z26.d, z30.d\n" + "uzp1 z30.d, z27.d, z31.d\n" + "uzp2 z27.d, z27.d, z31.d\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 54f\n" + ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" + "mov z12.s, z11.s[3]\n" + "mov z11.s, z11.s[0]\n" + "neg z4.s, p2/M, z4.s\n" + "mov z14.s, z13.s[3]\n" + "mov z13.s, z13.s[0]\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "54:" // Height 4: skip row sum fixup + "add z31.s, z31.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x27]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z20.s, z20.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "add z21.s, z21.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add z22.s, z22.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "add z16.s, z16.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "add z17.s, z17.s, z12.s\n" + "add z18.s, z18.s, z12.s\n" + "add z19.s, z19.s, z12.s\n" + "add z23.s, z23.s, z13.s\n" + "add z28.s, z28.s, z13.s\n" + "add z29.s, z29.s, z13.s\n" + "add z30.s, z30.s, z13.s\n" + "add z24.s, z24.s, z14.s\n" + "add z25.s, z25.s, z14.s\n" + "add z26.s, z26.s, z14.s\n" + "add z27.s, z27.s, z14.s\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 55f\n" + "and z4.d, z31.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z6.d, z21.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z4.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd 
z21.s, z21.s, z6.s\n" + "and z7.d, z22.d, z0.d\n" + "and z8.d, z16.d, z0.d\n" + "and z9.d, z17.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z22.s, z22.s, z7.s\n" + "sqadd z16.s, z16.s, z8.s\n" + "sqadd z17.s, z17.s, z9.s\n" + "and z10.d, z18.d, z0.d\n" + "and z4.d, z19.d, z0.d\n" + "and z5.d, z23.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z18.s, z18.s, z10.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "sqadd z23.s, z23.s, z5.s\n" + "and z6.d, z28.d, z0.d\n" + "and z7.d, z29.d, z0.d\n" + "and z8.d, z30.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z28.s, z28.s, z6.s\n" + "sqadd z29.s, z29.s, z7.s\n" + "sqadd z30.s, z30.s, z8.s\n" + "and z9.d, z24.d, z0.d\n" + "and z10.d, z25.d, z0.d\n" + "and z4.d, z26.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z24.s, z24.s, z9.s\n" + "sqadd z25.s, z25.s, z10.s\n" + "sqadd z26.s, z26.s, z4.s\n" + "and z5.d, z27.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z27.s, z27.s, z5.s\n" + "55:" // Height 4: no shift correction + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add x22, %x[qp], %[minval]\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x22]\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z31.h, z31.h, z20.h\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "uzp1 z20.h, z21.h, z22.h\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "uzp1 z31.b, z31.b, z20.b\n" + "st1b { z31.b }, p1, [x26]\n" + "add z17.s, z17.s, z4.s\n" + "addvl x26, x26, #1\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x21]\n" + "add z29.s, z29.s, z4.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z30.s, z30.s, z4.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "uzp1 z23.h, z23.h, 
z28.h\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z28.h, z29.h, z30.h\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z23.b, z23.b, z28.b\n" + "st1b { z23.b }, p1, [x20]\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x19]\n" + "56:" // Height 4: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 44b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 58f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 57f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "57:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "58:" // Exit + + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp index a2883bfa30..d870711c6e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,9 +22,10 @@ * IN THE SOFTWARE. 
*/ #pragma once -#ifdef ARM_COMPUTE_ENABLE_SVE +#ifdef ARM_COMPUTE_ENABLE_SVE #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" #define ARGLIST \ unsigned int, const unsigned int *, \ @@ -42,7 +43,8 @@ void sve_hybrid_u8u32_dot_6x4VL( ARGLIST ); class cls_sve_hybrid_u8u32_dot_6x4VL { public: - typedef uint8_t operand_type; + typedef uint8_t lhs_operand_type; + typedef uint8_t rhs_operand_type; typedef uint32_t result_type; typedef void (*kern_type)( ARGLIST ); @@ -68,7 +70,36 @@ public: return true; } - StdTransformsSVE transforms = {}; + StdTransformsSVE transforms = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.56 }; + case CPUModel::A510: + return { 20.98 }; + case CPUModel::V1: + return { 62.19 }; + } + } + + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.59, 15.67, 0.61 }; + case CPUModel::A510: + return { 22.75, 3.90, 0.47 }; + case CPUModel::V1: + return { 62.97, 19.27, 0.92 }; + } + } + + return { 1.0 }; + } // Default to the generic kernel kern_type kernel=sve_hybrid_u8u32_dot_6x4VL; @@ -80,4 +111,5 @@ public: } // namespace arm_gemm #undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp new file mode 100644 index 0000000000..11f9165a3f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp @@ -0,0 +1,1033 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_u8u32_dot_6x4VL_a64fx ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p4.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 51f\n" + "cmp %x[M], #0x4\n" + "bgt 41f\n" + "beq 31f\n" + "cmp %x[M], #0x2\n" + "bgt 21f\n" + "beq 11f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 3f\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "b 4f\n" + "3:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "4:" // Height 1: setup done + "mov x27, #0x0\n" + "5:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x25, x25, #0x4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z11.s, z7.b, z0.b\n" + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b,
z0.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "udot z10.s, z6.b, z0.b\n" + "udot z11.s, z7.b, z0.b\n" + "addvl x9, x9, #4\n" + "bne 5b\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "10:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 62f\n" + "11:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "12:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 13f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "b 14f\n" + "13:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "14:" // Height 2: setup done + "mov x27, #0x0\n" + "15:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 17f\n" + "16:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "17:" // Height 2: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 19f\n" + "18:" // Height 2: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x4\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "subs x26, x26, #0x4\n" + "add x24, x24, #0x4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 18b\n" + "19:" // Height 2: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "addvl x9, x9, #4\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, 
z1.b\n" + "bne 15b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "20:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 12b\n" + "b 62f\n" + "21:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "22:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "b 24f\n" + "23:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "24:" // Height 3: setup done + "mov x27, #0x0\n" + "25:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 26f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 27f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 27f\n" + "26:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "27:" // Height 3: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 29f\n" + "28:" // Height 3: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "udot z16.s, z6.b, z2.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "add x23, x23, #0x4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z11.s, z7.b, z0.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "ld1rw { z1.s }, 
p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 28b\n" + "29:" // Height 3: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "udot z16.s, z6.b, z2.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "bne 25b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "30:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 22b\n" + "b 62f\n" + "31:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "32:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 33f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "34:" // Height 4: setup done + "mov x27, #0x0\n" + "35:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz 
x27, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "37:" // Height 4: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 39f\n" + "38:" // Height 4: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x24, x24, #0x4\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "add x23, x23, #0x4\n" + "add x22, x22, #0x4\n" + "udot z17.s, z7.b, z2.b\n" + "udot z21.s, z7.b, z3.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "udot z19.s, z7.b, z2.b\n" + "udot z23.s, z7.b, z3.b\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 38b\n" + "39:" // Height 4: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "cmp x27, x19\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "udot z21.s, z7.b, z3.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "udot z23.s, z7.b, z3.b\n" + "bne 35b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "40:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 32b\n" + "b 62f\n" + "41:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "42:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + 
"whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" + "b 44f\n" + "43:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "44:" // Height 5: setup done + "mov x27, #0x0\n" + "45:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 47f\n" + "46:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "47:" // Height 5: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 49f\n" + "48:" // Height 5: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "udot z24.s, z6.b, z4.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "add x21, x21, #0x4\n" + "udot z21.s, z7.b, z3.b\n" + "udot z25.s, z7.b, z4.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "udot z26.s, z6.b, z4.b\n" + 
"udot z11.s, z7.b, z0.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "udot z23.s, z7.b, z3.b\n" + "udot z27.s, z7.b, z4.b\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 48b\n" + "49:" // Height 5: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "cmp x27, x19\n" + "udot z24.s, z6.b, z4.b\n" + "udot z9.s, z7.b, z0.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "udot z21.s, z7.b, z3.b\n" + "udot z25.s, z7.b, z4.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "udot z26.s, z6.b, z4.b\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "udot z23.s, z7.b, z3.b\n" + "udot z27.s, z7.b, z4.b\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" + "50:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 42b\n" + "b 62f\n" + "51:" // Height 6 + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "52:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "incw x19\n" + "whilelt p0.s, x19, x10\n" + "tbz %x[flags], #0, 53f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x28]\n" + "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + 
"ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" + "ld1w { z28.s }, p3/Z, [x19]\n" + "ld1w { z29.s }, p2/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x19, #2, MUL VL]\n" + "ld1w { z31.s }, p0/Z, [x19, #3, MUL VL]\n" + "b 54f\n" + "53:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "54:" // Height 6: setup done + "mov x27, #0x0\n" + "55:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w26, [x19, x27, LSL #0x2]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 56f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 57f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 57f\n" + "56:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "57:" // Height 6: input setup done + "subs x26, x26, #0x4\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "ld1rw { z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "ble 59f\n" + "58:" // Height 6: Multiply loop: Main loop + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x25, x25, #0x4\n" + "subs x26, x26, #0x4\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "add x24, x24, #0x4\n" + "add x23, x23, #0x4\n" + "udot z24.s, z6.b, z4.b\n" + "udot z28.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "add x22, x22, #0x4\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "add x21, x21, #0x4\n" + "add x20, x20, #0x4\n" + "udot z17.s, z7.b, z2.b\n" + "udot z21.s, z7.b, z3.b\n" + "udot z25.s, z7.b, z4.b\n" + "udot z29.s, z7.b, z5.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "udot z26.s, z6.b, z4.b\n" + "udot z30.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9]\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "ld1rw { z0.s }, p4/Z, [x25]\n" + "ld1rw { z1.s }, p4/Z, [x24]\n" + "udot z19.s, z7.b, z2.b\n" + "udot z23.s, z7.b, z3.b\n" + "ld1rw { 
z2.s }, p4/Z, [x23]\n" + "ld1rw { z3.s }, p4/Z, [x22]\n" + "udot z27.s, z7.b, z4.b\n" + "udot z31.s, z7.b, z5.b\n" + "ld1rw { z4.s }, p4/Z, [x21]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n" + "bgt 58b\n" + "59:" // Height 6: Multiply loop: Main loop skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "udot z8.s, z6.b, z0.b\n" + "udot z12.s, z6.b, z1.b\n" + "add x27, x27, #0x1\n" + "udot z16.s, z6.b, z2.b\n" + "udot z20.s, z6.b, z3.b\n" + "cmp x27, x19\n" + "udot z24.s, z6.b, z4.b\n" + "udot z28.s, z6.b, z5.b\n" + "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b\n" + "udot z13.s, z7.b, z1.b\n" + "udot z17.s, z7.b, z2.b\n" + "udot z21.s, z7.b, z3.b\n" + "udot z25.s, z7.b, z4.b\n" + "udot z29.s, z7.b, z5.b\n" + "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z10.s, z6.b, z0.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z18.s, z6.b, z2.b\n" + "udot z22.s, z6.b, z3.b\n" + "udot z26.s, z6.b, z4.b\n" + "udot z30.s, z6.b, z5.b\n" + "udot z11.s, z7.b, z0.b\n" + "udot z15.s, z7.b, z1.b\n" + "udot z19.s, z7.b, z2.b\n" + "udot z23.s, z7.b, z3.b\n" + "udot z27.s, z7.b, z4.b\n" + "udot z31.s, z7.b, z5.b\n" + "bne 55b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z8.s }, p3, [x28]\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z9.s }, p2, [x28, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "st1w { z10.s }, p1, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p0, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" + "st1w { z28.s }, p3, [x19]\n" + "st1w { z29.s }, p2, [x19, #1, MUL VL]\n" + "st1w { z30.s }, p1, [x19, #2, MUL VL]\n" + "st1w { z31.s }, p0, [x19, #3, MUL VL]\n" + "60:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 52b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", 
"x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp index 413bc65288..fc8bdb50a9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -137,13 +137,12 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" "udot z10.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" "udot z11.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" @@ -178,7 +177,6 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" @@ -217,9 +215,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z10.s, z6.b, z0.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" "10:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 5b\n" "st1w { z8.s }, p4, [x28]\n" @@ -296,16 +293,14 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "cmp x26, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" @@ -356,9 +351,7 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z13.s, z7.b, z1.b[0]\n" @@ -413,10 +406,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z11.s, z7.b, z0.b[3]\n" "udot z15.s, z7.b, z1.b[3]\n" "21:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 16b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -511,21 +502,18 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot 
z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "cmp x26, #0x10\n" - "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" "udot z17.s, z7.b, z2.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" @@ -590,12 +578,9 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "add x23, x23, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "udot z16.s, z6.b, z2.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" @@ -665,11 +650,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[3]\n" "udot z19.s, z7.b, z2.b[3]\n" "32:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 27b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -781,26 +763,22 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" + "add x22, x22, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" @@ -879,19 +857,15 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "udot z16.s, z6.b, z2.b[0]\n" - "add x22, x22, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" + "udot z16.s, z6.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" "udot z21.s, z7.b, z3.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" @@ -972,12 +946,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[3]\n" "udot z23.s, z7.b, z3.b[3]\n" "43:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm 
pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 38b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1106,32 +1076,27 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" "add x21, x21, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "udot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "udot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" "udot z22.s, z6.b, z3.b[0]\n" @@ -1223,22 +1188,17 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "udot z16.s, z6.b, z2.b[0]\n" + "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "add x21, x21, #0x10\n" - "udot z17.s, z7.b, z2.b[0]\n" + "udot z16.s, z6.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" "udot z24.s, z6.b, z4.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" "udot z21.s, z7.b, z3.b[0]\n" "udot z25.s, z7.b, z4.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" @@ -1334,13 +1294,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[3]\n" "udot z27.s, z7.b, z4.b[3]\n" "54:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 49b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1489,37 +1444,31 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" "add x20, 
x20, #0x10\n" "udot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x10\n" "udot z28.s, z6.b, z5.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" "udot z25.s, z7.b, z4.b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" "udot z29.s, z7.b, z5.b[0]\n" "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" "udot z22.s, z6.b, z3.b[0]\n" @@ -1625,25 +1574,19 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "udot z16.s, z6.b, z2.b[0]\n" + "udot z12.s, z6.b, z1.b[0]\n" "ld1rqb { z4.b }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" - "add x20, x20, #0x10\n" - "udot z17.s, z7.b, z2.b[0]\n" "udot z24.s, z6.b, z4.b[0]\n" "udot z28.s, z6.b, z5.b[0]\n" "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" "udot z21.s, z7.b, z3.b[0]\n" "udot z25.s, z7.b, z4.b[0]\n" "udot z29.s, z7.b, z5.b[0]\n" @@ -1754,14 +1697,8 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z27.s, z7.b, z4.b[3]\n" "udot z31.s, z7.b, z5.b[3]\n" "65:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 60b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp new file mode 100644 index 0000000000..7f8eadc528 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#ifdef ARM_COMPUTE_ENABLE_SVE +#include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<uint8_t>, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg<uint32_t>, \ + const uint32_t *, Activation, bool + +namespace arm_gemm +{ +// Actual kernel implementations +void sve_hybrid_u8u32_mmla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_u8u32_mmla_6x4VL +{ +public: + typedef uint8_t lhs_operand_type; + typedef uint8_t rhs_operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<uint32_t>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 8; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 8> transforms = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, uint32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 54.45 }; + case CPUModel::A510: + return { 24.22 }; + case CPUModel::V1: + return { 105.16 }; + } + } + + + if (std::is_same<T, uint8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 54.90, 15.69, 0.62 }; + case CPUModel::A510: + return { 26.80, 3.89, 0.47 }; + case CPUModel::V1: + return { 108.33, 18.66, 0.92 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_hybrid_u8u32_mmla_6x4VL; + cls_sve_hybrid_u8u32_mmla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp new file mode 100644 index 0000000000..e8bad69ccd --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp @@ -0,0 +1,1675 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_u8u32_mmla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 56f\n" + "cmp %x[M], #0x4\n" + "bgt 45f\n" + "beq 34f\n" + "cmp %x[M], #0x2\n" + "bgt 23f\n" + "beq 12f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 3f\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 4f\n" + "3:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "4:" // Height 1: setup done + "mov x27, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "cmp x26, #0x10\n" + "add x25, x25, #0x10\n" + ".inst 0x45c79808 // ummla z8.s,
z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "ble 10f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "10:" // Height 1: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 5b\n" + "uzp1 z8.d, z8.d, z12.d\n" + "st1w { z8.s }, p4, [x28]\n" + "uzp1 z9.d, z9.d, z13.d\n" + "uzp1 z10.d, z10.d, z14.d\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z11.d, z11.d, z15.d\n" + "st1w { z10.s }, p2, [x28, #2, MUL 
VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "11:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 68f\n" + "12:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "13:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 14f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "zip2 z13.d, z10.d, z13.d\n" + "zip1 z10.d, z11.d, z14.d\n" + "zip2 z14.d, z11.d, z14.d\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "b 15f\n" + "14:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "15:" // Height 2: setup done + "mov x27, #0x0\n" + "16:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "18:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 20f\n" + "19:" // Height 2: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x24, x24, #0x10\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, 
z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "ble 21f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "21:" // Height 2: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 16b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z8.s }, p4, [x23]\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "22:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 13b\n" + "b 68f\n" + "23:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "24:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz 
%x[flags], #0, 25f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "zip2 z15.d, z16.d, z15.d\n" + "zip1 z16.d, z17.d, z20.d\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 26f\n" + "25:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "26:" // Height 3: setup done + "mov x27, #0x0\n" + "27:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 28f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 29f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 29f\n" + "28:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "29:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 31f\n" + "30:" // Height 3: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "add x23, x23, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 
0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + "bgt 30b\n" + "31:" // Height 3: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + "ble 32f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 
0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + "32:" // Height 3: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 27b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z16.d, z16.d, z20.d\n" + "addvl x28, x28, #4\n" + "uzp1 z17.d, z17.d, z21.d\n" + "st1w { z8.s }, p4, [x23]\n" + "uzp1 z18.d, z18.d, z22.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp1 z19.d, z19.d, z23.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "33:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 24b\n" + "b 68f\n" + "34:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "35:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "zip1 z17.d, z18.d, z21.d\n" + "zip2 z21.d, z18.d, z21.d\n" + "zip1 z18.d, z19.d, z22.d\n" + "zip2 
z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "b 37f\n" + "36:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "37:" // Height 4: setup done + "mov x27, #0x0\n" + "38:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 39f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 40f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 40f\n" + "39:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "40:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 42f\n" + "41:" // Height 4: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "cmp x26, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "add x25, x25, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "add x23, x23, #0x10\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "add x22, x22, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" 
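+ // Second half of the K block: the trn2 (odd) 64-bit halves of each row pair, held in z1 and z3, are multiplied against B columns fetched at negative offsets from the advanced panel pointer x9; each ummla accumulates a 2x2 tile of int32 sums from 8-deep unsigned 8-bit dot products.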
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + "bgt 41b\n" + "42:" // Height 4: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + "ble 43f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + "43:" // Height 4: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 38b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + 
"uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "addvl x28, x28, #4\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "44:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 35b\n" + "b 68f\n" + "45:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "46:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z25.s }, p4/Z, [x20]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "zip1 z19.d, z24.d, z23.d\n" + "zip2 z23.d, z24.d, z23.d\n" + "zip1 z24.d, z25.d, z28.d\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 48f\n" + "47:" // Height 5: no accumulate 
+ "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "48:" // Height 5: setup done + "mov x27, #0x0\n" + "49:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 51f\n" + "50:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "51:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 53f\n" + "52:" // Height 5: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "sub x26, x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "cmp x26, #0x10\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x25, x25, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "add x24, x24, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x23, x23, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "add x22, x22, #0x10\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + "add x21, x21, #0x10\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, 
z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + "bgt 52b\n" + "53:" // Height 5: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "subs x26, x26, #0x8\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, 
z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + "ble 54f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + "54:" // Height 5: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 49b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "addvl x28, x28, #4\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "uzp1 z24.d, z24.d, z28.d\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "uzp1 z25.d, z25.d, z29.d\n" + "st1w { z16.s }, p4, [x21]\n" + "uzp1 z26.d, z26.d, z30.d\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "uzp1 z27.d, z27.d, z31.d\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "55:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + 
"cmp x10, XZR\n" + "bgt 46b\n" + "b 68f\n" + "56:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "57:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x10\n" + "incw x19\n" + "whilelt p3.s, x19, x10\n" + "incw x19\n" + "whilelt p2.s, x19, x10\n" + "incw x19\n" + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z9.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z9.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "zip2 z12.d, z9.d, z12.d\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "zip1 z9.d, z10.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "zip2 z13.d, z10.d, z13.d\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z11.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z11.d, z16.d, z15.d\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "zip2 z15.d, z16.d, z15.d\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "zip1 z16.d, z17.d, z20.d\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip2 z20.d, z17.d, z20.d\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z17.d, z18.d, z21.d\n" + "ld1w { z25.s }, p4/Z, [x20]\n" + "zip2 z21.d, z18.d, z21.d\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "zip1 z18.d, z19.d, z22.d\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip2 z22.d, z19.d, z22.d\n" + "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z19.d, z24.d, z23.d\n" + "ld1w { z28.s }, p4/Z, [x19]\n" + "zip2 z23.d, z24.d, z23.d\n" + "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n" + "zip1 z24.d, z25.d, z28.d\n" + "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n" + "zip2 z28.d, z25.d, z28.d\n" + "zip1 z25.d, z26.d, z29.d\n" + "zip2 z29.d, z26.d, z29.d\n" + "zip1 z26.d, z27.d, z30.d\n" + "zip2 z30.d, z27.d, z30.d\n" + "zip1 z27.d, z6.d, z31.d\n" + "zip2 z31.d, z6.d, z31.d\n" + "b 59f\n" + "58:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "59:" // Height 6: setup done + "mov x27, #0x0\n" + "60:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, 
[x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x27, 62f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 62f\n" + "61:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "62:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 64f\n" + "63:" // Height 6: Multiply loop: Main loop head + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "cmp x26, #0x10\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x25, x25, #0x10\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + "add x24, x24, #0x10\n" + "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "add x23, x23, #0x10\n" + "trn2 z3.d, z3.d, z4.d\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + "add x20, x20, #0x10\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, 
z3.b, z6.b\n" + ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + "bgt 63b\n" + "64:" // Height 6: Multiply loop: Single iteration only + "ld1b { z7.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "subs x26, x26, #0x8\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" + "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" + ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" + "ld1rqb { z6.b }, p0/Z, [x20]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" + ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" + ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" + ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" + ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" + ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" + ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" + ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" + ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" + ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + "ble 65f\n" + "ld1b { z7.b }, p5/Z, [x9]\n" + ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" + "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" + ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" + ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" + ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" + ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" + ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n" + ".inst 0x45c6982d 
// ummla z13.s, z1.b, z6.b\n" + ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" + ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" + ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" + ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" + ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" + ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" + ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" + ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + "65:" // Height 6: Multiply loop: multiply skip + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 60b\n" + "uzp1 z7.d, z8.d, z12.d\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp2 z8.d, z8.d, z12.d\n" + "st1w { z7.s }, p4, [x28]\n" + "uzp1 z12.d, z9.d, z13.d\n" + "add x23, x28, x19, LSL #2\n" + "uzp2 z9.d, z9.d, z13.d\n" + "st1w { z12.s }, p3, [x28, #1, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "add x22, x23, x19, LSL #2\n" + "uzp2 z10.d, z10.d, z14.d\n" + "st1w { z13.s }, p2, [x28, #2, MUL VL]\n" + "uzp1 z14.d, z11.d, z15.d\n" + "add x21, x22, x19, LSL #2\n" + "uzp2 z11.d, z11.d, z15.d\n" + "st1w { z14.s }, p1, [x28, #3, MUL VL]\n" + "uzp1 z15.d, z16.d, z20.d\n" + "add x20, x21, x19, LSL #2\n" + "uzp2 z16.d, z16.d, z20.d\n" + "st1w { z8.s }, p4, [x23]\n" + "add x19, x20, x19, LSL #2\n" + "uzp1 z20.d, z17.d, z21.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" + "addvl x28, x28, #4\n" + "uzp2 z17.d, z17.d, z21.d\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" + "uzp1 z21.d, z18.d, z22.d\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" + "uzp2 z18.d, z18.d, z22.d\n" + "st1w { z15.s }, p4, [x22]\n" + "uzp1 z22.d, z19.d, z23.d\n" + "st1w { z20.s }, p3, [x22, #1, MUL VL]\n" + "uzp2 z19.d, z19.d, z23.d\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "uzp1 z23.d, z24.d, z28.d\n" + "st1w { z22.s }, p1, [x22, #3, MUL VL]\n" + "uzp2 z24.d, z24.d, z28.d\n" + "st1w { z16.s }, p4, [x21]\n" + "uzp1 z28.d, z25.d, z29.d\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "uzp2 z25.d, z25.d, z29.d\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "uzp1 z29.d, z26.d, z30.d\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "uzp2 z26.d, z26.d, z30.d\n" + "st1w { z23.s }, p4, [x20]\n" + "uzp1 z30.d, z27.d, z31.d\n" + "st1w { z28.s }, p3, [x20, #1, MUL VL]\n" + "uzp2 z27.d, z27.d, z31.d\n" + "st1w { z29.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z30.s }, p1, [x20, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x19]\n" + "st1w { z25.s }, p3, [x19, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x19, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x19, #3, MUL VL]\n" + "66:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 57b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 68f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 67f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "67:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "68:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] 
"r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp index d717b745c9..f5fdf993aa 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,63 +10,92 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE - -#include "../bfloat.hpp" #include "../std_transforms_sve.hpp" +#include "../bfloat.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const bfloat16 *, const bfloat16 *, \ + float *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int); +void sve_interleaved_bf16fp32_dot_8x3VL( ARGLIST ); -class cls_sve_interleaved_bf16fp32_dot_8x3VL { +class cls_sve_interleaved_bf16fp32_dot_8x3VL +{ public: typedef bfloat16 operand_type; typedef float result_type; - typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return get_vector_length<float>() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length<float>(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 2; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL; + if (std::is_same<T, bfloat16>::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.92, 3.74, 7.14 }; + case CPUModel::A510: + return { 7.54, 3.77, 2.43 }; + case CPUModel::V1: + return { 31.82, 5.11, 11.20 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL; cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp index 4f774b133f..e604dcc4bc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,320 +10,237 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
*/ #ifdef ARM_COMPUTE_ENABLE_SVE +#include <cstddef> #include "../../bfloat.hpp" -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; +void sve_interleaved_bf16fp32_dot_8x3VL( + const bfloat16 *Apanel, const bfloat16 *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { - K /= 2; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const bfloat16 *Bpanel = {}; + } ka; - for (int yb=0; yb() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length<float>(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 4; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + + if (std::is_same<T, bfloat16>::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.41, 4.30, 7.14 }; + case CPUModel::A510: + return { 7.78, 4.01, 2.43 }; + case CPUModel::V1: + return { 62.50, 5.09, 11.32 }; + } + } - kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL; + if (std::is_same<T, float>::value) { + switch (ci->get_cpu_model()) { + default: + return { 30.86, 2.36, 5.28 }; + case CPUModel::A510: + return { 7.75, 2.47, 2.39 }; + case CPUModel::V1: + return { 60.83, 2.69, 8.66 }; + } + } + + return { 1.0 }; + } + + // Default to the generic kernel + kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL; cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp index c720942140..de4f0ad313 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,388 +10,284 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
*/ #ifdef ARM_COMPUTE_ENABLE_SVE +#include #include "../../bfloat.hpp" -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; +void sve_interleaved_bf16fp32_mmla_8x3VL( + const bfloat16 *Apanel, const bfloat16 *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const bfloat16 *Bpanel = {}; + } ka; - for (int yb=0; yb() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length<__fp16>(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 1; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_fp16_mla_8x3VL; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 15.96, 3.85, 6.91 }; + case CPUModel::A510: + return { 13.84, 2.07, 2.52 }; + case CPUModel::V1: + return { 31.90, 5.15, 10.34 }; + } + } + + return { 1.0 }; + } - cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *) + // Default to the generic kernel + kern_type kernel=sve_interleaved_fp16_mla_8x3VL; + cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *ci) { - + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_interleaved_fp16_mla_8x3VL_a64fx; + break; + } } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp new file mode 100644 index 0000000000..602634706e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
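
The fp16 class above is the first in this section to choose its implementation at construction time: the generic SVE body stays the default, and the constructor swaps in the A64FX-tuned entry point when get_cpu_model() reports that part. The shape of the dispatch, reduced to a self-contained sketch (names are illustrative):

    // Sketch of the constructor-time kernel dispatch used by the classes above.
    enum class CpuModel { GENERIC, A64FX };

    using kern_type = void (*)(const float *, const float *, float *, int, int, int);

    void kernel_generic(const float *, const float *, float *, int, int, int) {}
    void kernel_a64fx  (const float *, const float *, float *, int, int, int) {}

    struct cls_example_kernel {
        kern_type kernel = kernel_generic; // default to the generic kernel

        explicit cls_example_kernel(CpuModel model) {
            switch (model) {
                default:
                    break;                 // keep the generic body
                case CpuModel::A64FX:
                    kernel = kernel_a64fx; // per-CPU tuned variant
                    break;
            }
        }
    };
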
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include + +namespace arm_gemm { + +void sve_interleaved_fp16_mla_8x3VL_a64fx( + const __fp16 *Apanel, const __fp16 *Bpanel, + __fp16 *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const __fp16 *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + "ptrue p0.b\n" + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "ld1h { z0.h }, p0/Z, [x20]\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "fmla z8.h, p0/M, z0.h, z3.h\n" + "fmla z9.h, p0/M, z1.h, z3.h\n" + "sub x19, x19, #0x2\n" + "fmla z10.h, p0/M, z2.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "fmla z11.h, p0/M, z0.h, z4.h\n" + "fmla z12.h, p0/M, z1.h, z4.h\n" + "fmla z13.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" + "fmla z14.h, p0/M, z0.h, z5.h\n" + "fmla z15.h, p0/M, z1.h, z5.h\n" + "cmp x19, #0x2\n" + "fmla z16.h, p0/M, z2.h, z5.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" + "fmla z17.h, p0/M, z0.h, z6.h\n" + "fmla z18.h, p0/M, z1.h, z6.h\n" + "fmla z19.h, p0/M, z2.h, z6.h\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z0.h, z3.h\n" + "fmla z21.h, p0/M, z1.h, z3.h\n" + "fmla z22.h, p0/M, z2.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n" + "fmla z23.h, p0/M, z0.h, z4.h\n" + "fmla z24.h, p0/M, z1.h, z4.h\n" + "fmla z25.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n" + "fmla z26.h, p0/M, z0.h, z5.h\n" + "fmla z27.h, p0/M, z1.h, z5.h\n" + "fmla z28.h, p0/M, z2.h, z5.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n" + "fmla z29.h, p0/M, z0.h, z6.h\n" + "ld1h { z0.h }, p0/Z, [x20, #3, MUL VL]\n" + "fmla z30.h, p0/M, z1.h, z6.h\n" + "fmla z31.h, p0/M, z2.h, z6.h\n" + "ld1h { z1.h }, p0/Z, [x20, #4, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x20, #5, MUL VL]\n" + "fmla z8.h, p0/M, z0.h, z3.h\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n" + "fmla z9.h, p0/M, z1.h, z3.h\n" + "fmla z10.h, p0/M, z2.h, z3.h\n" + "fmla z11.h, p0/M, z0.h, z4.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n" + "fmla z12.h, p0/M, z1.h, z4.h\n" + "fmla z13.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n" + "fmla z14.h, p0/M, z0.h, z5.h\n" + "fmla z15.h, p0/M, z1.h, z5.h\n" + "addvl x20, x20, #6\n" + "fmla z16.h, p0/M, z2.h, z5.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n" + "fmla z17.h, p0/M, z0.h, z6.h\n" + "fmla z18.h, p0/M, z1.h, z6.h\n" + "fmla z19.h, p0/M, z2.h, z6.h\n" + "ld1rh { z6.h }, p0/Z, 
[%x[Apanel], #30]\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla z20.h, p0/M, z0.h, z3.h\n" + "fmla z21.h, p0/M, z1.h, z3.h\n" + "fmla z22.h, p0/M, z2.h, z3.h\n" + "fmla z23.h, p0/M, z0.h, z4.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" + "fmla z24.h, p0/M, z1.h, z4.h\n" + "fmla z25.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" + "fmla z26.h, p0/M, z0.h, z5.h\n" + "fmla z27.h, p0/M, z1.h, z5.h\n" + "fmla z28.h, p0/M, z2.h, z5.h\n" + "fmla z29.h, p0/M, z0.h, z6.h\n" + "ld1h { z0.h }, p0/Z, [x20]\n" + "fmla z30.h, p0/M, z1.h, z6.h\n" + "fmla z31.h, p0/M, z2.h, z6.h\n" + "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" + "bge 3b\n" + "4:" // main loop skip + "fmla z8.h, p0/M, z0.h, z3.h\n" + "fmla z9.h, p0/M, z1.h, z3.h\n" + "addvl x20, x20, #3\n" + "fmla z10.h, p0/M, z2.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "fmla z11.h, p0/M, z0.h, z4.h\n" + "fmla z12.h, p0/M, z1.h, z4.h\n" + "fmla z13.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" + "fmla z14.h, p0/M, z0.h, z5.h\n" + "fmla z15.h, p0/M, z1.h, z5.h\n" + "fmla z16.h, p0/M, z2.h, z5.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" + "fmla z17.h, p0/M, z0.h, z6.h\n" + "fmla z18.h, p0/M, z1.h, z6.h\n" + "fmla z19.h, p0/M, z2.h, z6.h\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z0.h, z3.h\n" + "fmla z21.h, p0/M, z1.h, z3.h\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla z22.h, p0/M, z2.h, z3.h\n" + "fmla z23.h, p0/M, z0.h, z4.h\n" + "fmla z24.h, p0/M, z1.h, z4.h\n" + "fmla z25.h, p0/M, z2.h, z4.h\n" + "fmla z26.h, p0/M, z0.h, z5.h\n" + "fmla z27.h, p0/M, z1.h, z5.h\n" + "fmla z28.h, p0/M, z2.h, z5.h\n" + "fmla z29.h, p0/M, z0.h, z6.h\n" + "fmla z30.h, p0/M, z1.h, z6.h\n" + "fmla z31.h, p0/M, z2.h, z6.h\n" + "cbz x19, 5f\n" + "ld1h { z0.h }, p0/Z, [x20]\n" + "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" + "fmla z8.h, p0/M, z0.h, z3.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" + "fmla z9.h, p0/M, z1.h, z3.h\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" + "fmla z10.h, p0/M, z2.h, z3.h\n" + "fmla z11.h, p0/M, z0.h, z4.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "fmla z12.h, p0/M, z1.h, z4.h\n" + "fmla z13.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" + "fmla z14.h, p0/M, z0.h, z5.h\n" + "fmla z15.h, p0/M, z1.h, z5.h\n" + "fmla z16.h, p0/M, z2.h, z5.h\n" + "fmla z17.h, p0/M, z0.h, z6.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" + "fmla z18.h, p0/M, z1.h, z6.h\n" + "fmla z19.h, p0/M, z2.h, z6.h\n" + "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" + "addvl x20, x20, #3\n" + "fmla z20.h, p0/M, z0.h, z3.h\n" + "fmla z21.h, p0/M, z1.h, z3.h\n" + "add %x[Apanel], %x[Apanel], #0x10\n" + "fmla z22.h, p0/M, z2.h, z3.h\n" + "fmla z23.h, p0/M, z0.h, z4.h\n" + "fmla z24.h, p0/M, z1.h, z4.h\n" + "fmla z25.h, p0/M, z2.h, z4.h\n" + "fmla z26.h, p0/M, z0.h, z5.h\n" + "fmla z27.h, p0/M, z1.h, z5.h\n" + "fmla z28.h, p0/M, z2.h, z5.h\n" + "fmla z29.h, p0/M, z0.h, z6.h\n" + "fmla z30.h, p0/M, z1.h, z6.h\n" + "fmla z31.h, p0/M, z2.h, z6.h\n" + "5:" // multiply loop done + "st1h { z8.h }, p0, [%x[Cpanel]]\n" + "subs x22, x22, #0x1\n" + "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n" + 
"st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #16\n" + "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n" + "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n" + "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n" + "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n" + "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n" + "st1h { z24.h }, p0, [%x[Cpanel]]\n" + "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #8\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp index 0f1937acc5..f8e4b89b95 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,310 +10,232 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
*/ #ifdef ARM_COMPUTE_ENABLE_SVE - -#include "../../asmlib.hpp" +#include namespace arm_gemm { -void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; +void sve_interleaved_fp16_mla_8x3VL( + const __fp16 *Apanel, const __fp16 *Bpanel, + __fp16 *Cpanel, int ablocks, int bblocks, int K) { - for (int yb=0; yb() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 1; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_fp32_mla_8x3VL; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 7.2307, 3.876, 2.932 }; + } + } - cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *) - { + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_fp32_mla_8x3VL; + cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_interleaved_fp32_mla_8x3VL_a64fx; + break; + } } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp new file mode 100644 index 0000000000..6defe0e223 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
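
The 8x3VL shape used throughout is set by the SVE register budget: 8 rows times 3 accumulator vectors occupies z8-z31, leaving z0-z2 for B columns and z3-z6 for broadcast A values. out_width() then scales with the hardware vector length; a quick worked check of the tile dimensions (get_vector_length here stands in for the library helper of the same name):

    #include <cstdio>

    // Worked check of the 8x3VL tile dimensions for common SVE widths.
    int main() {
        const unsigned sve_bits[] = {128, 256, 512};
        for (unsigned bits : sve_bits) {
            unsigned vl_fp32 = bits / 32;   // get_vector_length<float>()
            unsigned out_w   = vl_fp32 * 3; // out_width(): 3 vectors per row
            printf("SVE %u-bit: tile is 8 x %u fp32, using %u accumulators\n",
                   bits, out_w, 8u * 3u);
        }
        return 0;
    }
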
+ */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include + +namespace arm_gemm { + +void sve_interleaved_fp32_mla_8x3VL_a64fx( + const float *Apanel, const float *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const float *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/1) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + "ptrue p0.b\n" + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "ld1w { z0.s }, p0/Z, [x20]\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "fmla z8.s, p0/M, z0.s, z3.s\n" + "fmla z9.s, p0/M, z1.s, z3.s\n" + "sub x19, x19, #0x2\n" + "fmla z10.s, p0/M, z2.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "fmla z11.s, p0/M, z0.s, z4.s\n" + "fmla z12.s, p0/M, z1.s, z4.s\n" + "fmla z13.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "fmla z14.s, p0/M, z0.s, z5.s\n" + "fmla z15.s, p0/M, z1.s, z5.s\n" + "cmp x19, #0x2\n" + "fmla z16.s, p0/M, z2.s, z5.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "fmla z17.s, p0/M, z0.s, z6.s\n" + "fmla z18.s, p0/M, z1.s, z6.s\n" + "fmla z19.s, p0/M, z2.s, z6.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z0.s, z3.s\n" + "fmla z21.s, p0/M, z1.s, z3.s\n" + "fmla z22.s, p0/M, z2.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "fmla z23.s, p0/M, z0.s, z4.s\n" + "fmla z24.s, p0/M, z1.s, z4.s\n" + "fmla z25.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" + "fmla z26.s, p0/M, z0.s, z5.s\n" + "fmla z27.s, p0/M, z1.s, z5.s\n" + "fmla z28.s, p0/M, z2.s, z5.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" + "fmla z29.s, p0/M, z0.s, z6.s\n" + "ld1w { z0.s }, p0/Z, [x20, #3, MUL VL]\n" + "fmla z30.s, p0/M, z1.s, z6.s\n" + "fmla z31.s, p0/M, z2.s, z6.s\n" + "ld1w { z1.s }, p0/Z, [x20, #4, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x20, #5, MUL VL]\n" + "fmla z8.s, p0/M, z0.s, z3.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n" + "fmla z9.s, p0/M, z1.s, z3.s\n" + "fmla z10.s, p0/M, z2.s, z3.s\n" + "fmla z11.s, p0/M, z0.s, z4.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" + "fmla z12.s, p0/M, z1.s, z4.s\n" + "fmla z13.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" + "fmla z14.s, p0/M, z0.s, z5.s\n" + "fmla z15.s, p0/M, z1.s, z5.s\n" + "addvl x20, x20, #6\n" + "fmla z16.s, p0/M, z2.s, z5.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" + "fmla z17.s, p0/M, z0.s, z6.s\n" + "fmla z18.s, p0/M, z1.s, z6.s\n" + "fmla z19.s, p0/M, z2.s, z6.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], 
#60]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + "fmla z20.s, p0/M, z0.s, z3.s\n" + "fmla z21.s, p0/M, z1.s, z3.s\n" + "fmla z22.s, p0/M, z2.s, z3.s\n" + "fmla z23.s, p0/M, z0.s, z4.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "fmla z24.s, p0/M, z1.s, z4.s\n" + "fmla z25.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "fmla z26.s, p0/M, z0.s, z5.s\n" + "fmla z27.s, p0/M, z1.s, z5.s\n" + "fmla z28.s, p0/M, z2.s, z5.s\n" + "fmla z29.s, p0/M, z0.s, z6.s\n" + "ld1w { z0.s }, p0/Z, [x20]\n" + "fmla z30.s, p0/M, z1.s, z6.s\n" + "fmla z31.s, p0/M, z2.s, z6.s\n" + "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "bge 3b\n" + "4:" // main loop skip + "fmla z8.s, p0/M, z0.s, z3.s\n" + "fmla z9.s, p0/M, z1.s, z3.s\n" + "addvl x20, x20, #3\n" + "fmla z10.s, p0/M, z2.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "fmla z11.s, p0/M, z0.s, z4.s\n" + "fmla z12.s, p0/M, z1.s, z4.s\n" + "fmla z13.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "fmla z14.s, p0/M, z0.s, z5.s\n" + "fmla z15.s, p0/M, z1.s, z5.s\n" + "fmla z16.s, p0/M, z2.s, z5.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "fmla z17.s, p0/M, z0.s, z6.s\n" + "fmla z18.s, p0/M, z1.s, z6.s\n" + "fmla z19.s, p0/M, z2.s, z6.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z0.s, z3.s\n" + "fmla z21.s, p0/M, z1.s, z3.s\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla z22.s, p0/M, z2.s, z3.s\n" + "fmla z23.s, p0/M, z0.s, z4.s\n" + "fmla z24.s, p0/M, z1.s, z4.s\n" + "fmla z25.s, p0/M, z2.s, z4.s\n" + "fmla z26.s, p0/M, z0.s, z5.s\n" + "fmla z27.s, p0/M, z1.s, z5.s\n" + "fmla z28.s, p0/M, z2.s, z5.s\n" + "fmla z29.s, p0/M, z0.s, z6.s\n" + "fmla z30.s, p0/M, z1.s, z6.s\n" + "fmla z31.s, p0/M, z2.s, z6.s\n" + "cbz x19, 5f\n" + "ld1w { z0.s }, p0/Z, [x20]\n" + "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "fmla z8.s, p0/M, z0.s, z3.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "fmla z9.s, p0/M, z1.s, z3.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "fmla z10.s, p0/M, z2.s, z3.s\n" + "fmla z11.s, p0/M, z0.s, z4.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "fmla z12.s, p0/M, z1.s, z4.s\n" + "fmla z13.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "fmla z14.s, p0/M, z0.s, z5.s\n" + "fmla z15.s, p0/M, z1.s, z5.s\n" + "fmla z16.s, p0/M, z2.s, z5.s\n" + "fmla z17.s, p0/M, z0.s, z6.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "fmla z18.s, p0/M, z1.s, z6.s\n" + "fmla z19.s, p0/M, z2.s, z6.s\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "addvl x20, x20, #3\n" + "fmla z20.s, p0/M, z0.s, z3.s\n" + "fmla z21.s, p0/M, z1.s, z3.s\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "fmla z22.s, p0/M, z2.s, z3.s\n" + "fmla z23.s, p0/M, z0.s, z4.s\n" + "fmla z24.s, p0/M, z1.s, z4.s\n" + "fmla z25.s, p0/M, z2.s, z4.s\n" + "fmla z26.s, p0/M, z0.s, z5.s\n" + "fmla z27.s, p0/M, z1.s, z5.s\n" + "fmla z28.s, p0/M, z2.s, z5.s\n" + "fmla z29.s, p0/M, z0.s, z6.s\n" + "fmla z30.s, p0/M, z1.s, z6.s\n" + "fmla z31.s, p0/M, z2.s, z6.s\n" + "5:" // multiply loop done + "st1w { z8.s }, p0, [%x[Cpanel]]\n" + "subs x22, x22, #0x1\n" + "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { 
z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #16\n" + "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" + "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" + "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" + "st1w { z24.s }, p0, [%x[Cpanel]]\n" + "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #8\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp index 10feaa130b..e02db6ec48 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,319 +10,236 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
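
All of the rewritten bodies share the K-loop skeleton visible above: ka.K is preloaded as (K/k_unroll) - 1, the main loop retires two unrolled steps per pass (the cmp/sub against #0x2), a "main loop skip" block always performs one step, and a cbz-guarded block handles the odd leftover. The control flow, reduced to C++ (structure only):

    // Control-flow skeleton of the "main loop head / main loop skip" pattern.
    // k_arg is the value stored in ka.K, i.e. total_steps - 1.
    void k_loop_skeleton(long k_arg) {
        long k = k_arg;
        while (k >= 2) {   // "3:" main loop head: two steps per pass
            // step(); step();
            k -= 2;        // "sub x19, x19, #0x2"
        }
        // "4:" main loop skip: one unconditional step
        // step();
        if (k != 0) {      // "cbz x19, 5f": odd leftover step
            // step();
        }
        // "5:" multiply loop done: store the 24 accumulators
    }
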
*/ #ifdef ARM_COMPUTE_ENABLE_SVE - -#include "../../asmlib.hpp" +#include <cstddef> namespace arm_gemm { -void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; +void sve_interleaved_fp32_mla_8x3VL( + const float *Apanel, const float *Bpanel, + float *Cpanel, int ablocks, int bblocks, int K) { - for (int yb=0; yb #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const int8_t *, const int8_t *, \ + int32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); +void sve_interleaved_s8s32_dot_8x3VL( ARGLIST ); -class cls_sve_interleaved_s8s32_dot_8x3VL { +class cls_sve_interleaved_s8s32_dot_8x3VL +{ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return get_vector_length<int32_t>() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length<int32_t>(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 4; } - // Use the standard fixed size transforms. + StdTransformsSVE<operand_type, result_type, 8, 3, 4> transforms = {}; StdTransformsSVE<operand_type, result_type, 8, 3, 4, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_s8s32_dot_8x3VL; + if (std::is_same<T, int32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.66, 4.10, 7.99 }; + case CPUModel::V1: + return { 63.30, 4.97, 11.35 }; + case CPUModel::A510: + return { 27.42, 3.47, 2.88 }; + } + } + + + if (std::is_same<T, int8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.67, 3.57, 0.50 }; + case CPUModel::V1: + return { 63.35, 4.76, 0.77 }; + case CPUModel::A510: + return { 27.47, 1.70, 0.28 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_s8s32_dot_8x3VL; cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp new file mode 100644 index 0000000000..5ca4b73b8a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include +#include + +namespace arm_gemm { + +void sve_interleaved_s8s32_dot_8x3VL_a64fx( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + "ptrue p0.b\n" + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "sdot z8.s, z0.b, z3.b\n" + "sdot z9.s, z1.b, z3.b\n" + "sub x19, x19, #0x2\n" + "sdot z10.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "sdot z11.s, z0.b, z4.b\n" + "sdot z12.s, z1.b, z4.b\n" + "sdot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "sdot z14.s, z0.b, z5.b\n" + "sdot z15.s, z1.b, z5.b\n" + "cmp x19, #0x2\n" + "sdot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "sdot z17.s, z0.b, z6.b\n" + "sdot z18.s, z1.b, z6.b\n" + "sdot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "sdot z20.s, z0.b, z3.b\n" + "sdot z21.s, z1.b, z3.b\n" + "sdot z22.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "sdot z23.s, z0.b, z4.b\n" + "sdot z24.s, z1.b, z4.b\n" + "sdot z25.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" + "sdot z26.s, z0.b, z5.b\n" + "sdot z27.s, z1.b, z5.b\n" + "sdot z28.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" + "sdot z29.s, z0.b, z6.b\n" + "ld1b { z0.b }, p0/Z, [x20, #3, MUL VL]\n" + "sdot z30.s, z1.b, z6.b\n" + "sdot z31.s, z2.b, z6.b\n" + "ld1b { z1.b }, p0/Z, [x20, #4, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #5, MUL VL]\n" + "sdot z8.s, z0.b, z3.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n" + "sdot z9.s, z1.b, z3.b\n" + "sdot z10.s, z2.b, z3.b\n" + "sdot z11.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" + "sdot z12.s, z1.b, z4.b\n" + "sdot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, 
[%x[Apanel], #52]\n" + "sdot z14.s, z0.b, z5.b\n" + "sdot z15.s, z1.b, z5.b\n" + "addvl x20, x20, #6\n" + "sdot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" + "sdot z17.s, z0.b, z6.b\n" + "sdot z18.s, z1.b, z6.b\n" + "sdot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + "sdot z20.s, z0.b, z3.b\n" + "sdot z21.s, z1.b, z3.b\n" + "sdot z22.s, z2.b, z3.b\n" + "sdot z23.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "sdot z24.s, z1.b, z4.b\n" + "sdot z25.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "sdot z26.s, z0.b, z5.b\n" + "sdot z27.s, z1.b, z5.b\n" + "sdot z28.s, z2.b, z5.b\n" + "sdot z29.s, z0.b, z6.b\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "sdot z30.s, z1.b, z6.b\n" + "sdot z31.s, z2.b, z6.b\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "bge 3b\n" + "4:" // main loop skip + "sdot z8.s, z0.b, z3.b\n" + "sdot z9.s, z1.b, z3.b\n" + "addvl x20, x20, #3\n" + "sdot z10.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "sdot z11.s, z0.b, z4.b\n" + "sdot z12.s, z1.b, z4.b\n" + "sdot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "sdot z14.s, z0.b, z5.b\n" + "sdot z15.s, z1.b, z5.b\n" + "sdot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "sdot z17.s, z0.b, z6.b\n" + "sdot z18.s, z1.b, z6.b\n" + "sdot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "sdot z20.s, z0.b, z3.b\n" + "sdot z21.s, z1.b, z3.b\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "sdot z22.s, z2.b, z3.b\n" + "sdot z23.s, z0.b, z4.b\n" + "sdot z24.s, z1.b, z4.b\n" + "sdot z25.s, z2.b, z4.b\n" + "sdot z26.s, z0.b, z5.b\n" + "sdot z27.s, z1.b, z5.b\n" + "sdot z28.s, z2.b, z5.b\n" + "sdot z29.s, z0.b, z6.b\n" + "sdot z30.s, z1.b, z6.b\n" + "sdot z31.s, z2.b, z6.b\n" + "cbz x19, 5f\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "sdot z8.s, z0.b, z3.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "sdot z9.s, z1.b, z3.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "sdot z10.s, z2.b, z3.b\n" + "sdot z11.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "sdot z12.s, z1.b, z4.b\n" + "sdot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "sdot z14.s, z0.b, z5.b\n" + "sdot z15.s, z1.b, z5.b\n" + "sdot z16.s, z2.b, z5.b\n" + "sdot z17.s, z0.b, z6.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "sdot z18.s, z1.b, z6.b\n" + "sdot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "addvl x20, x20, #3\n" + "sdot z20.s, z0.b, z3.b\n" + "sdot z21.s, z1.b, z3.b\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "sdot z22.s, z2.b, z3.b\n" + "sdot z23.s, z0.b, z4.b\n" + "sdot z24.s, z1.b, z4.b\n" + "sdot z25.s, z2.b, z4.b\n" + "sdot z26.s, z0.b, z5.b\n" + "sdot z27.s, z1.b, z5.b\n" + "sdot z28.s, z2.b, z5.b\n" + "sdot z29.s, z0.b, z6.b\n" + "sdot z30.s, z1.b, z6.b\n" + "sdot z31.s, z2.b, z6.b\n" + "5:" // multiply loop done + "st1w { z8.s }, p0, [%x[Cpanel]]\n" + "subs x22, x22, #0x1\n" + "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z13.s 
}, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #16\n" + "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" + "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" + "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" + "st1w { z24.s }, p0, [%x[Cpanel]]\n" + "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #8\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp index 01c0f8cddc..5fb938b20f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,320 +10,237 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
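
Each sdot above folds four int8 products into one int32 lane, which is why k_unroll() is 4, A values arrive via ld1rw (one 32-bit group of four int8 elements broadcast per row), and ka.K is preloaded as (K/4) - 1. A scalar model of a single SDOT lane (reference semantics, not an intrinsics API):

    #include <cstdint>

    // Reference semantics of one SDOT lane: acc += dot(4 x int8, 4 x int8).
    int32_t sdot_lane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
        for (int i = 0; i < 4; i++) {
            acc += int32_t(a[i]) * int32_t(b[i]);
        }
        return acc;
    }
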
*/ #ifdef ARM_COMPUTE_ENABLE_SVE +#include #include -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; +void sve_interleaved_s8s32_dot_8x3VL( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; - for (int yb=0; yb #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const int8_t *, const int8_t *, \ + int32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); +void sve_interleaved_s8s32_mmla_8x3VL( ARGLIST ); -class cls_sve_interleaved_s8s32_mmla_8x3VL { +class cls_sve_interleaved_s8s32_mmla_8x3VL +{ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return get_vector_length() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 8; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; StdTransformsSVE transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 61.98, 3.90, 7.94 }; + case CPUModel::V1: + return { 123.42, 5.00, 11.52 }; + case CPUModel::A510: + return { 43.14, 3.62, 2.90 }; + } + } + + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 61.97, 3.64, 0.50 }; + case CPUModel::V1: + return { 123.84, 4.93, 0.76 }; + case CPUModel::A510: + return { 43.36, 1.86, 0.28 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL; cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp index 9420210aae..b8f1864af3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,375 +23,271 @@ */ #ifdef ARM_COMPUTE_ENABLE_SVE +#include #include -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; +void sve_interleaved_s8s32_mmla_8x3VL( + const int8_t *Apanel, const int8_t *Bpanel, + int32_t *Cpanel, int ablocks, int bblocks, int K) { - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const int8_t *Bpanel = {}; + } ka; - for (int yb=0; yb #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const uint8_t *, const uint8_t *, \ + uint32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void sve_interleaved_u8u32_dot_8x3VL( ARGLIST ); -class cls_sve_interleaved_u8u32_dot_8x3VL { +class cls_sve_interleaved_u8u32_dot_8x3VL +{ public: typedef uint8_t operand_type; typedef uint32_t result_type; - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return get_vector_length() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 4; } - // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; StdTransformsSVE transforms_quantized = {}; + template + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_u8u32_dot_8x3VL; + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.66, 4.11, 7.94 }; + case CPUModel::A510: + return { 27.44, 3.41, 2.90 }; + case CPUModel::V1: + return { 63.30, 4.97, 11.52 }; + } + } + + + if (std::is_same::value) { + switch (ci->get_cpu_model()) { + default: + return { 31.67, 4.04, 0.50 }; + case CPUModel::A510: + return { 27.45, 1.65, 0.28 }; + case CPUModel::V1: + return { 63.35, 4.96, 0.77 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_u8u32_dot_8x3VL; cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp new file mode 100644 index 0000000000..1e2fb138fd --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include +#include + +namespace arm_gemm { + +void sve_interleaved_u8u32_dot_8x3VL_a64fx( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { + + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; + + ka.bblocks = bblocks; + ka.K = (K/4) - 1; + ka.Bpanel = Bpanel; + + __asm__ __volatile__( + "ptrue p0.b\n" + "1:" // Height loop + "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" + "mov x21, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "2:" // Width loop + "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" + "mov %x[Apanel], x21\n" + "cmp x19, #0x2\n" + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "blt 4f\n" + "3:" // main loop head + "udot z8.s, z0.b, z3.b\n" + "udot z9.s, z1.b, z3.b\n" + "sub x19, x19, #0x2\n" + "udot z10.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "udot z11.s, z0.b, z4.b\n" + "udot z12.s, z1.b, z4.b\n" + "udot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "udot z14.s, z0.b, z5.b\n" + "udot z15.s, z1.b, z5.b\n" + "cmp x19, #0x2\n" + "udot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "udot z17.s, z0.b, z6.b\n" + "udot z18.s, z1.b, z6.b\n" + "udot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "udot z20.s, z0.b, z3.b\n" + "udot z21.s, z1.b, z3.b\n" + "udot z22.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "udot z23.s, z0.b, z4.b\n" + "udot z24.s, z1.b, z4.b\n" + "udot z25.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" + "udot 
z26.s, z0.b, z5.b\n" + "udot z27.s, z1.b, z5.b\n" + "udot z28.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" + "udot z29.s, z0.b, z6.b\n" + "ld1b { z0.b }, p0/Z, [x20, #3, MUL VL]\n" + "udot z30.s, z1.b, z6.b\n" + "udot z31.s, z2.b, z6.b\n" + "ld1b { z1.b }, p0/Z, [x20, #4, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #5, MUL VL]\n" + "udot z8.s, z0.b, z3.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n" + "udot z9.s, z1.b, z3.b\n" + "udot z10.s, z2.b, z3.b\n" + "udot z11.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" + "udot z12.s, z1.b, z4.b\n" + "udot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" + "udot z14.s, z0.b, z5.b\n" + "udot z15.s, z1.b, z5.b\n" + "addvl x20, x20, #6\n" + "udot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" + "udot z17.s, z0.b, z6.b\n" + "udot z18.s, z1.b, z6.b\n" + "udot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "add %x[Apanel], %x[Apanel], #0x40\n" + "udot z20.s, z0.b, z3.b\n" + "udot z21.s, z1.b, z3.b\n" + "udot z22.s, z2.b, z3.b\n" + "udot z23.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "udot z24.s, z1.b, z4.b\n" + "udot z25.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "udot z26.s, z0.b, z5.b\n" + "udot z27.s, z1.b, z5.b\n" + "udot z28.s, z2.b, z5.b\n" + "udot z29.s, z0.b, z6.b\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "udot z30.s, z1.b, z6.b\n" + "udot z31.s, z2.b, z6.b\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "bge 3b\n" + "4:" // main loop skip + "udot z8.s, z0.b, z3.b\n" + "udot z9.s, z1.b, z3.b\n" + "addvl x20, x20, #3\n" + "udot z10.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "udot z11.s, z0.b, z4.b\n" + "udot z12.s, z1.b, z4.b\n" + "udot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "udot z14.s, z0.b, z5.b\n" + "udot z15.s, z1.b, z5.b\n" + "udot z16.s, z2.b, z5.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "udot z17.s, z0.b, z6.b\n" + "udot z18.s, z1.b, z6.b\n" + "udot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "udot z20.s, z0.b, z3.b\n" + "udot z21.s, z1.b, z3.b\n" + "add %x[Apanel], %x[Apanel], #0x20\n" + "udot z22.s, z2.b, z3.b\n" + "udot z23.s, z0.b, z4.b\n" + "udot z24.s, z1.b, z4.b\n" + "udot z25.s, z2.b, z4.b\n" + "udot z26.s, z0.b, z5.b\n" + "udot z27.s, z1.b, z5.b\n" + "udot z28.s, z2.b, z5.b\n" + "udot z29.s, z0.b, z6.b\n" + "udot z30.s, z1.b, z6.b\n" + "udot z31.s, z2.b, z6.b\n" + "cbz x19, 5f\n" + "ld1b { z0.b }, p0/Z, [x20]\n" + "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" + "udot z8.s, z0.b, z3.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" + "udot z9.s, z1.b, z3.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" + "udot z10.s, z2.b, z3.b\n" + "udot z11.s, z0.b, z4.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "udot z12.s, z1.b, z4.b\n" + "udot z13.s, z2.b, z4.b\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "udot z14.s, z0.b, z5.b\n" + "udot z15.s, z1.b, z5.b\n" + "udot z16.s, z2.b, z5.b\n" + "udot z17.s, z0.b, z6.b\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "udot z18.s, z1.b, z6.b\n" + "udot z19.s, z2.b, z6.b\n" + "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" + "addvl x20, x20, #3\n" + "udot z20.s, z0.b, z3.b\n" + "udot z21.s, z1.b, z3.b\n" + "add 
%x[Apanel], %x[Apanel], #0x20\n" + "udot z22.s, z2.b, z3.b\n" + "udot z23.s, z0.b, z4.b\n" + "udot z24.s, z1.b, z4.b\n" + "udot z25.s, z2.b, z4.b\n" + "udot z26.s, z0.b, z5.b\n" + "udot z27.s, z1.b, z5.b\n" + "udot z28.s, z2.b, z5.b\n" + "udot z29.s, z0.b, z6.b\n" + "udot z30.s, z1.b, z6.b\n" + "udot z31.s, z2.b, z6.b\n" + "5:" // multiply loop done + "st1w { z8.s }, p0, [%x[Cpanel]]\n" + "subs x22, x22, #0x1\n" + "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #16\n" + "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" + "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" + "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" + "st1w { z24.s }, p0, [%x[Cpanel]]\n" + "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n" + "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "addvl %x[Cpanel], %x[Cpanel], #8\n" + "bgt 2b\n" + "subs %x[ablocks], %x[ablocks], #0x1\n" + "bne 1b\n" + : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) + : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) + : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp index 2139bab69d..f1642d0b21 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,320 +10,237 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #ifdef ARM_COMPUTE_ENABLE_SVE +#include <cstddef> #include <cstdint> -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; +void sve_interleaved_u8u32_dot_8x3VL( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; - for (int yb=0; yb #include "../std_transforms_sve.hpp" +#include "../performance_parameters.hpp" -namespace arm_gemm { +#define ARGLIST \ + const uint8_t *, const uint8_t *, \ + uint32_t *, int, int, int +namespace arm_gemm +{ // Actual kernel implementations -void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void sve_interleaved_u8u32_mmla_8x3VL( ARGLIST ); -class cls_sve_interleaved_u8u32_mmla_8x3VL { +class cls_sve_interleaved_u8u32_mmla_8x3VL +{ public: typedef uint8_t operand_type; typedef uint32_t result_type; - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + static unsigned int out_width() { return get_vector_length<uint32_t>() * 3; } - static unsigned int out_height() + static unsigned int stripe_width() { - return 8; + return get_vector_length<uint32_t>(); } - static unsigned int k_unroll() + static constexpr unsigned int k_unroll() { return 8; } - // Use the standard fixed size transforms.
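+    // For illustration (vector lengths assumed, not taken from the patch):
+    // out_width() above counts elements of the 32-bit result type, so a
+    // caller sizing an output tile sees, e.g.:
+    //
+    //   unsigned int rows = cls_sve_interleaved_u8u32_mmla_8x3VL::out_height(); // always 8
+    //   unsigned int cols = cls_sve_interleaved_u8u32_mmla_8x3VL::out_width();  // 3 vectors of u32:
+    //                                                                           // 24 on 256-bit SVE,
+    //                                                                           // 48 on 512-bit SVE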
+ StdTransformsSVE<operand_type, result_type, 8, 3, 8> transforms = {}; StdTransformsSVE<operand_type, result_type, 8, 3, 8, true> transforms_quantized = {}; + template<typename T> + static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { - kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL; + if (std::is_same<T, uint32_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 61.97, 4.11, 7.93 }; + case CPUModel::A510: + return { 43.18, 3.57, 2.89 }; + case CPUModel::V1: + return { 123.47, 5.03, 11.76 }; + } + } + + + if (std::is_same<T, uint8_t>::value) { + switch (ci->get_cpu_model()) { + default: + return { 62.00, 4.08, 0.51 }; + case CPUModel::A510: + return { 38.02, 1.85, 0.28 }; + case CPUModel::V1: + return { 123.84, 4.98, 0.76 }; + } + } + + return { 1.0 }; + } + // Default to the generic kernel + kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL; cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST + #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp index d42385789c..c4fdfa6abc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,375 +23,271 @@ */ #ifdef ARM_COMPUTE_ENABLE_SVE +#include <cstddef> #include <cstdint> -#include "../../asmlib.hpp" namespace arm_gemm { -void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; +void sve_interleaved_u8u32_mmla_8x3VL( + const uint8_t *Apanel, const uint8_t *Bpanel, + uint32_t *Cpanel, int ablocks, int bblocks, int K) { - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; + struct KernelArgs { + size_t bblocks = {}; + size_t K = {}; + const uint8_t *Bpanel = {}; + } ka; - for (int yb=0; yb
"3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" @@ -339,17 +325,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -419,21 +399,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - "udot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "udot z31.s, z16.b, z7.b[0]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "b.ne 4b\n" @@ -598,17 +570,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -689,21 +655,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z30.s, z16.b, z6.b[0]\n" "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" @@ -892,17 +850,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, 
[a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -993,21 +945,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z30.s, z16.b, z6.b[0]\n" "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" @@ -1221,17 +1165,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "addvl %[b_ptr0], %[b_ptr0], #5\n" "cbz %[loops], 2f\n" "mov z24.s, #0\n" @@ -1312,7 +1250,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" @@ -1350,19 +1287,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" @@ -1641,17 +1571,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #6\n" "cbz %[loops], 2f\n" @@ 
-1741,7 +1665,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" @@ -1753,7 +1676,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z27.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z28.s, p7, [c_ptr4]\n" "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" @@ -1781,17 +1703,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" @@ -2096,17 +2012,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #7\n" @@ -2205,13 +2115,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -2245,17 +2153,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -2586,17 +2488,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" 
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -2704,13 +2600,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -2722,7 +2616,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -2746,15 +2639,10 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" @@ -3111,17 +2999,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -3247,13 +3129,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -3265,7 +3145,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, 
c_ptr5, #1\n" "mov z29.s, #0\n" @@ -3289,19 +3168,14 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" "addvl %[b_ptr0], %[b_ptr0], #1\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" @@ -3708,17 +3582,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -3853,13 +3721,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -3871,7 +3737,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -3895,17 +3760,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -4341,17 +4201,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -4495,13 +4349,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -4513,7 +4365,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -4537,17 +4388,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -5010,17 +4856,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -5173,13 +5013,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -5191,7 +5029,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -5215,17 +5052,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" 
"addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -5715,17 +5547,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -5895,13 +5721,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -5913,7 +5737,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -5937,17 +5760,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -6488,17 +6306,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, 
MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -6677,13 +6489,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -6695,7 +6505,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -6719,17 +6528,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -7297,17 +7101,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -7495,13 +7293,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -7513,7 +7309,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -7537,17 +7332,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" 
"udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -8143,17 +7933,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -8350,13 +8134,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -8368,7 +8150,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -8392,17 +8173,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -8968,4 +8744,4 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp new file mode 100644 index 0000000000..a7525e5ec1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* As some of the merges need these headers, but are all included in the + * arm_gemm namespace, put these headers here. */ +#include + +#include + +#include "arm_gemm.hpp" +#include "asmlib.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" + +namespace arm_gemm { + +#include "merges/list-fp16.hpp" + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp index 77d86b7dd8..a4124c4a54 100644 --- a/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp +++ b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,4 +38,4 @@ namespace arm_gemm { #include "merges/list-sve.hpp" -} // namespace arm_gemm \ No newline at end of file +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp index bbfe8f23d9..2b712cee61 100644 --- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp +++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, 2021 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,6 @@ /* As some of the merges need these headers, but are all included in the * arm_gemm namespace, put these headers here. */ #include -#include #include diff --git a/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp new file mode 100644 index 0000000000..c1356347df --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "a64_merge_fp16_24x8.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp index dae874ef94..3443c6f0a8 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "a32_merge_float_8x6.hpp" -#include "a64_merge_fp16_24x8.hpp" #include "a64_merge_fp32_12x8.hpp" #include "a64_merge_s32_12x8.hpp" #include "a64_merge_s32_4x4.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp index fdb4f584d8..1e2a9acc1d 100644 --- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -198,6 +198,19 @@ public: _params.bias = bias; _params.bias_multi_stride = bias_multi_stride; } + + GemmConfig get_config() override { + GemmConfig c = _subgemm->get_config(); + + std::string n = "quantize_wrapper["; + n.append(c.filter); + n.append("]"); + + c.method = GemmMethod::QUANTIZE_WRAPPER; + c.filter = n; + + return c; + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/transform-sve.cpp b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp new file mode 100644 index 0000000000..3f6963d32b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "utils.hpp" + +#include "bfloat.hpp" +#include "transform.hpp" + +#include + +namespace arm_gemm { + +#include "transforms/list-sve.hpp" + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp new file mode 100644 index 0000000000..60376ab80b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transform.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "utils.hpp" + +#include "bfloat.hpp" + +#include + +namespace arm_gemm { + +/* + * Generic transform. + * + * Assuming the untransposed case, this works by first reading + * consecutive values from the first input row. This same number of values + * are then read from the next rows. Now return to the first + * input row and repeat. + * + * Need to cope with the work requested in either dimension not actually + * being a multiple of the block sizes. + */ +template +struct TransformImpl { + template + static void Transform(TOut* out, const TIn* const in, const int stride, + const int y0, const int ymax, const int x0, const int xmax) { + // NOTE: This code is disabled to avoid the call to get_vector_length(), so templated transforms will not be + // correct for SVE. This is not an issue as we have specializations for all SVE cases. + // For SVE cases we multiply the interleave factor by the vector length. + // const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length() / BlockBy : 1); + const unsigned int IntBy = tIntBy; + + const int n_whole_y_blocks = (ymax - y0) / IntBy; + const int y_remainders = (ymax - y0) % IntBy; + const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0); + + const int n_whole_x_blocks = (xmax - x0) / BlockBy; + const int x_remainders = (xmax - x0) % BlockBy; + const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0); + + // "Y" loop: advance down the rows of the source IntBy rows at a time. + // Set up fill_rows to show the number rows to copy from, and blank_rows + // for the number of blank rows to add. + for (int y_block=0 ; y_block < n_y_blocks; y_block++) { + int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; + int blank_rows = IntBy - fill_rows; + + int y_base = y0 + (y_block * IntBy); + + // So now advance along this block of rows, BlockBy columns at a time. 
+ for (int x_block=0 ; x_block < n_x_blocks; x_block++) { + int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; + int blank_cols = BlockBy - fill_cols; + + int x_base = x0 + (x_block * BlockBy); + + for (int row = 0; row < fill_rows; row++) { + for (int col = 0; col < fill_cols; col++) { + // In-range copy. If it's transposed, we reverse the sense of rows and columns here. + if (Transposed) { + *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]); + } else { + *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]); + } + } + // "col" tail - row is in range but column is out of range. + for (int col=0; col < blank_cols; col++) { + *out++ = static_cast<TOut>(0); + } + } + // "row" tail - row is out of range so fill with zeros always. + TOut zeroval = static_cast<TOut>(0); + int pads = blank_rows * (fill_cols + blank_cols); + + for (int i=0; i<pads; i++) { + *out++ = zeroval; + } + } + } + } + + template <typename T> + static void Transform(T* out, const T* const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax) { + Transform<T, T>(out, in, stride, k0, kmax, x0, xmax); + } +}; + +/*****************************************************************************/ +template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt, typename TOut, typename TIn> +void Transform( + TOut* out, const TIn* const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax +) { + // Redirect to a specialised implementation predicated on argument size. + TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform( + out, in, stride, k0, kmax, x0, xmax + ); +} +/*****************************************************************************/ + +#include "transforms/list.hpp" + +// We don't have assembler transforms for AArch32, generate templated ones here. +#ifdef __arm__ +template void Transform<8, 1, true, VLType::None>(float *, const float *, int, int, int, int, int); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int); +#endif // AArch32 + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp index 5efeee5d35..f46e6c5fa3 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.hpp +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,96 +27,10 @@ namespace arm_gemm { -/* - * Generic transform. - * - * Assuming the untransposed case, this works by first reading <BlockBy> - * consecutive values from the first input row. This same number of values - * are then read from the next <IntBy-1> rows. Now return to the first - * input row and repeat. - * - * Need to cope with the work requested in either dimension not actually - * being a multiple of the block sizes. - */ -template<unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt> -struct TransformImpl { - template <typename TOut, typename TIn> - static void Transform(TOut* out, const TIn* const in, const int stride, - const int y0, const int ymax, const int x0, const int xmax) { - // For SVE cases we multiply the interleave factor by the vector length. - const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1); - - const int n_whole_y_blocks = (ymax - y0) / IntBy; - const int y_remainders = (ymax - y0) % IntBy; - const int n_y_blocks = n_whole_y_blocks + (y_remainders ?
1 : 0); - - const int n_whole_x_blocks = (xmax - x0) / BlockBy; - const int x_remainders = (xmax - x0) % BlockBy; - const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0); - - // "Y" loop: advance down the rows of the source IntBy rows at a time. - // Set up fill_rows to show the number rows to copy from, and blank_rows - // for the number of blank rows to add. - for (int y_block=0 ; y_block < n_y_blocks; y_block++) { - int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; - int blank_rows = IntBy - fill_rows; - - int y_base = y0 + (y_block * IntBy); - - // So now advance along this block of rows, BlockBy columns at a time. - for (int x_block=0 ; x_block < n_x_blocks; x_block++) { - int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; - int blank_cols = BlockBy - fill_cols; - - int x_base = x0 + (x_block * BlockBy); - - for (int row = 0; row < fill_rows; row++) { - for (int col = 0; col < fill_cols; col++) { - // In-range copy. If it's transposed, we reverse the sense of rows and columns here. - if (Transposed) { - *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]); - } else { - *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]); - } - } - // "col" tail - row is in range but column is out of range. - for (int col=0; col < blank_cols; col++) { - *out++ = static_cast<TOut>(0); - } - } - // "row" tail - row is out of range so fill with zeros always. - TOut zeroval = static_cast<TOut>(0); - int pads = blank_rows * (fill_cols + blank_cols); - - for (int i=0; i<pads; i++) { - *out++ = zeroval; - } - } - } - } - - template <typename T> - static inline void Transform(T* out, const T* const in, const int stride, - const int k0, const int kmax, const int x0, const int xmax) { - Transform<T, T>(out, in, stride, k0, kmax, x0, xmax); - } -}; - -/*****************************************************************************/ template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt, typename TOut, typename TIn> void Transform( TOut* out, const TIn* const in, const int stride, const int k0, const int kmax, const int x0, const int xmax -) { - // Redirect to a specialised implementation predicated on argument size. - TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform( - out, in, stride, k0, kmax, x0, xmax - ); -} -/*****************************************************************************/ - -#include "transforms/list.hpp" +); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp index 3ce1d328a7..b50c240a3a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,7 @@ // Generic unblocked transposed 8x32-bit sized specialisation template <> template <typename T> -inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( +void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( // Generic 16x16-bit sized specialisation template <> template <typename T> -inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( +void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -59,7 +59,7 @@ inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( // Specialised 16 x uint16_t version template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { +void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { __asm volatile ( "VLD1.32 {d0-d3}, [%[in0]]!\n" "VST1.32 {d0-d3}, [%[out]]\n" @@ -72,7 +72,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(con } template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { +void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { __asm volatile ( "VLD1.32 {d0-d3}, [%[in0]]!\n" "VST1.32 {d0-d3}, [%[out]]!\n" @@ -90,7 +90,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(con } template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { +void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { __asm __volatile ( "VLD1.32 {d0-d3}, [%[in0]]!\n" "VST1.32 {d0-d3}, [%[out]]!\n" @@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( +void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp new file mode 100644 index 0000000000..41c1c282e5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 32 * height * sizeof(uint32_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q15, [x24], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q14, [x22], #0x10\n" + "cmp x19, #0x20\n" + "ldr q13, [x21], #0x10\n" + "ldr q12, [x20], #0x10\n" + "ldr q11, [x24], #0x10\n" + "ldr q10, [x22], #0x10\n" + "ldr q9, [x21], #0x10\n" + "ldr q8, [x20], #0x10\n" + "ldr q7, [x24], #0x10\n" + "ldr q6, [x22], #0x10\n" + "ldr q5, [x21], #0x10\n" + "ldr q4, [x20], #0x10\n" + "ldr q3, [x24], #0x10\n" + "ldr q2, [x22], #0x10\n" + "ldr q1, [x21], #0x10\n" + "ldr q0, [x20], #0x10\n" + "ldr q31, [x24], #0x10\n" + "ldr q30, [x22], #0x10\n" + "ldr q29, [x21], #0x10\n" + "ldr q28, [x20], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q26, [x22], #0x10\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q15, [x23, #0x0]\n" + "str q11, [x23, #0x10]\n" + "str q7, [x23, #0x20]\n" + "str q3, [x23, #0x30]\n" + "str q31, [x23, #0x40]\n" + "str q27, [x23, #0x50]\n" + "str q23, [x23, #0x60]\n" + "str q19, [x23, #0x70]\n" + "str q14, [x23, #0x80]\n" + "str q10, [x23, #0x90]\n" + "str q6, [x23, #0xa0]\n" + "str q2, [x23, #0xb0]\n" + "str q30, [x23, #0xc0]\n" + "str q26, [x23, #0xd0]\n" + "str q22, [x23, #0xe0]\n" + "str q18, [x23, #0xf0]\n" + "str q13, [x23, #0x100]\n" + "str q9, [x23, #0x110]\n" + "str q5, [x23, #0x120]\n" + "str q1, [x23, #0x130]\n" + "str q29, [x23, #0x140]\n" + "str q25, [x23, #0x150]\n" + "str q21, [x23, #0x160]\n" + "str q17, [x23, #0x170]\n" + "str q12, [x23, #0x180]\n" + "str q8, [x23, #0x190]\n" + "str q4, [x23, #0x1a0]\n" + "str q0, [x23, #0x1b0]\n" + "str q28, [x23, #0x1c0]\n" + "str q24, [x23, 
#0x1d0]\n" + "str q20, [x23, #0x1e0]\n" + "str q16, [x23, #0x1f0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q31, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q30, [x22], #0x10\n" + "cmp x19, #0x10\n" + "ldr q29, [x21], #0x10\n" + "ldr q28, [x20], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q26, [x22], #0x10\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q31, [x23, #0x0]\n" + "str q27, [x23, #0x10]\n" + "str q23, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q30, [x23, #0x80]\n" + "str q26, [x23, #0x90]\n" + "str q22, [x23, #0xa0]\n" + "str q18, [x23, #0xb0]\n" + "str q29, [x23, #0x100]\n" + "str q25, [x23, #0x110]\n" + "str q21, [x23, #0x120]\n" + "str q17, [x23, #0x130]\n" + "str q28, [x23, #0x180]\n" + "str q24, [x23, #0x190]\n" + "str q20, [x23, #0x1a0]\n" + "str q16, [x23, #0x1b0]\n" + "add x23, x23, #0x40\n" + "bge 4b\n" + "5:" // Main row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x22], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [x23, #0x0]\n" + "str q18, [x23, #0x80]\n" + "str q17, [x23, #0x100]\n" + "str q16, [x23, #0x180]\n" + "add x23, x23, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr s19, [x24], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x22], #0x4\n" + "cmp x19, #0x1\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "str s19, [x23, #0x0]\n" + "str s18, [x23, #0x80]\n" + "str s17, [x23, #0x100]\n" + "str s16, [x23, #0x180]\n" + "add x23, x23, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x200\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q23, [x24], #0x10\n" + "sub x19, x19, #0x20\n" + "cmp x19, #0x20\n" + "ldr q22, [x24], #0x10\n" + "ldr q21, [x24], #0x10\n" + "ldr q20, [x24], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q23, [x23, #0x0]\n" + "str q22, [x23, #0x10]\n" + "str q21, [x23, #0x20]\n" + "str q20, [x23, #0x30]\n" + "str q19, [x23, #0x40]\n" + "str q18, [x23, #0x50]\n" + "str q17, [x23, #0x60]\n" + "str q16, [x23, #0x70]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: width 16 loop: loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "cmp x19, #0x10\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q19, [x23, #0x0]\n" + "str q18, [x23, #0x10]\n" + "str q17, [x23, #0x20]\n" + "str q16, [x23, #0x30]\n" + "add x23, x23, #0x40\n" + "bge 14b\n" + "15:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + 
"blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr q16, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "str q16, [x23, #0x0]\n" + "add x23, x23, #0x10\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr s16, [x24], #0x4\n" + "sub x19, x19, #0x1\n" + "cmp x19, #0x1\n" + "str s16, [x23, #0x0]\n" + "add x23, x23, #0x4\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x80\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace + +template<> +void Transform<32, 1, true, VLType::None>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_128( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 4, + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp new file mode 100644 index 0000000000..ec3273a526 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_12_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 12 * roundup(height, 4) * sizeof(uint8_t); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x30\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0x30\n" + "ldr q23, [x26], #0x10\n" + "cmp x19, #0x30\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr q17, [x28], #0x10\n" + "zip2 v22.16b, v18.16b, v16.16b\n" + "ldr q11, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v21.16b, v17.16b, v16.16b\n" + "ldr q18, [x28], #0x10\n" + "zip2 v10.16b, v17.16b, v16.16b\n" + "ldr q9, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v8.16b, v18.16b, v17.16b\n" + "ldr q16, [x24], #0x10\n" + "zip2 v7.16b, v18.16b, v17.16b\n" + "ldr q20, [x23], #0x10\n" + "ldr q6, [x22], #0x10\n" + "zip1 v17.16b, v23.16b, v16.16b\n" + "ldr q5, [x24], #0x10\n" + "zip2 v16.16b, v23.16b, v16.16b\n" + "ldr q4, [x23], #0x10\n" + "zip1 v3.16b, v19.16b, v17.16b\n" + "ldr q2, [x22], #0x10\n" + "zip2 v1.16b, v19.16b, v17.16b\n" + "ldr q19, [x21], #0x10\n" + "zip1 v0.16b, v22.16b, v16.16b\n" + "ldr q31, [x24], #0x10\n" + "zip2 v30.16b, v22.16b, v16.16b\n" + "ldr q29, [x23], #0x10\n" + "zip1 v16.16b, v11.16b, v5.16b\n" + "ldr q28, [x22], #0x10\n" + "zip1 v27.16b, v21.16b, v16.16b\n" + "ldr q26, [x21], #0x10\n" + "zip1 v18.16b, v20.16b, v19.16b\n" + "ldr q17, [x20], #0x10\n" + "zip2 v20.16b, v20.16b, v19.16b\n" + "ldr q25, [x21], #0x10\n" + "zip2 v24.16b, v21.16b, v16.16b\n" + "zip1 v23.16b, v4.16b, v26.16b\n" + "ldr q22, [x20], #0x10\n" + "zip1 v16.16b, v6.16b, v17.16b\n" + "ldr q21, [x20], #0x10\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "zip2 v18.16b, v18.16b, v16.16b\n" + "str q3, [x27, #0x0]\n" + "zip2 v16.16b, v6.16b, v17.16b\n" + "str q1, [x27, #0x10]\n" + "zip1 v17.16b, v20.16b, v16.16b\n" + "str q0, [x27, #0x20]\n" + "zip2 v20.16b, v20.16b, v16.16b\n" + "str q19, [x27, #0x30]\n" + "zip1 v16.16b, v2.16b, v22.16b\n" + "str q18, [x27, #0x40]\n" + "zip1 v19.16b, v23.16b, v16.16b\n" + "str q17, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.16b, v23.16b, v16.16b\n" + "str q30, [x27, #0x0]\n" + "zip2 v17.16b, v11.16b, v5.16b\n" + "str q27, [x27, #0x10]\n" + "zip1 v16.16b, v10.16b, v17.16b\n" + "str q24, [x27, #0x20]\n" + "zip2 v17.16b, v10.16b, v17.16b\n" + "str q20, [x27, #0x30]\n" + "zip1 v20.16b, v9.16b, v31.16b\n" + "str q19, [x27, #0x40]\n" + "zip1 v19.16b, v8.16b, v20.16b\n" + "str q18, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.16b, v4.16b, v26.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v2.16b, v22.16b\n" + "str q17, [x27, #0x10]\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "str q19, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v16.16b\n" + "str q17, [x27, #0x30]\n" + "zip1 v19.16b, v29.16b, v25.16b\n" + "str q16, [x27, 
#0x40]\n" + "zip1 v17.16b, v28.16b, v21.16b\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v16.16b, v8.16b, v20.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v18.16b, v9.16b, v31.16b\n" + "zip2 v17.16b, v19.16b, v17.16b\n" + "zip1 v16.16b, v7.16b, v18.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v16.16b, v7.16b, v18.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v18.16b, v29.16b, v25.16b\n" + "str q17, [x27, #0x30]\n" + "zip2 v17.16b, v28.16b, v21.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr d19, [x28], #0x8\n" + "sub x19, x19, #0xc\n" + "ldr d18, [x26], #0x8\n" + "cmp x19, #0xc\n" + "ldr d17, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ld1 { v19.s }[2], [x28], #0x4\n" + "ld1 { v18.s }[2], [x26], #0x4\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "zip1 v23.16b, v19.16b, v17.16b\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "zip2 v20.16b, v19.16b, v17.16b\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ldr d22, [x22], #0x8\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "ldr d19, [x21], #0x8\n" + "zip2 v16.16b, v18.16b, v16.16b\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "zip1 v18.16b, v23.16b, v17.16b\n" + "ldr d21, [x20], #0x8\n" + "zip2 v17.16b, v23.16b, v17.16b\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ld1 { v21.s }[2], [x20], #0x4\n" + "zip1 v20.16b, v24.16b, v19.16b\n" + "str q18, [x27, #0x0]\n" + "zip2 v19.16b, v24.16b, v19.16b\n" + "str q17, [x27, #0x10]\n" + "str q16, [x27, #0x20]\n" + "zip1 v18.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v22.16b, v21.16b\n" + "zip1 v16.16b, v20.16b, v18.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v16.16b, v20.16b, v18.16b\n" + "str q16, [x27, #0x40]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr s18, [x28], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s17, [x26], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x25], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr s19, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x20], #0x4\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str q18, [x27, #0x0]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x27, #0x30]\n" + "add x27, x27, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr b18, [x28], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b17, [x26], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x25], #0x1\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr b16, [x24], #0x1\n" + "ldr b20, [x23], #0x1\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr b19, [x22], #0x1\n" + "ldr b17, [x21], #0x1\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr b16, [x20], #0x1\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str s18, [x27, #0x0]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x27, #0x30]\n" + "add x27, x27, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], 
#0x60\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x30\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0x30\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x30\n" + "ldr q16, [x25], #0x10\n" + "zip1 v28.16b, v18.16b, v16.16b\n" + "ldr q17, [x28], #0x10\n" + "zip2 v27.16b, v18.16b, v16.16b\n" + "ldr q26, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v25.16b, v17.16b, v16.16b\n" + "ldr q18, [x28], #0x10\n" + "zip2 v24.16b, v17.16b, v16.16b\n" + "ldr q23, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v22.16b, v18.16b, v16.16b\n" + "ldr q17, [x24], #0x10\n" + "zip2 v21.16b, v18.16b, v16.16b\n" + "ldr q20, [x24], #0x10\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "zip2 v18.16b, v19.16b, v17.16b\n" + "ldr q19, [x24], #0x10\n" + "zip1 v17.16b, v28.16b, v16.16b\n" + "zip2 v16.16b, v28.16b, v16.16b\n" + "str q17, [x27, #0x0]\n" + "zip1 v17.16b, v27.16b, v18.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v16.16b, v27.16b, v18.16b\n" + "str q17, [x27, #0x20]\n" + "add x27, x27, %x[out_stride]\n" + "zip1 v18.16b, v26.16b, v20.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v17.16b, v26.16b, v20.16b\n" + "zip1 v16.16b, v25.16b, v18.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v16.16b, v25.16b, v18.16b\n" + "str q16, [x27, #0x20]\n" + "add x27, x27, %x[out_stride]\n" + "zip1 v16.16b, v24.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v24.16b, v17.16b\n" + "zip1 v17.16b, v23.16b, v19.16b\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.16b, v22.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v16.16b, v22.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v17.16b, v23.16b, v19.16b\n" + "zip1 v16.16b, v21.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v16.16b, v21.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr d18, [x28], #0x8\n" + "sub x19, x19, #0xc\n" + "ldr d21, [x26], #0x8\n" + "cmp x19, #0xc\n" + "ldr d17, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ld1 { v18.s }[2], [x28], #0x4\n" + "ld1 { v21.s }[2], [x26], #0x4\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "zip1 v20.16b, v18.16b, v17.16b\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "zip2 v19.16b, v18.16b, v17.16b\n" + "zip1 v18.16b, v21.16b, v16.16b\n" + "zip2 v17.16b, v21.16b, v16.16b\n" + "zip1 v16.16b, v20.16b, v18.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v20.16b, v18.16b\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "add x27, x27, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr s17, [x28], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s18, [x26], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x25], #0x4\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr s16, [x24], #0x4\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 
v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b17, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x25], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x27, #0x0]\n"
+ "add x27, x27, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
new file mode 100644
index 0000000000..1603be2ef8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 12 * roundup(height, 8) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x30\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x30\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x30\n"
+ "ldr q11, [x25], #0x10\n"
+ "ldr q10, [x24], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v9.16b, v18.16b, v16.16b\n"
+ "ldr q8, [x26], #0x10\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q6, [x24], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v5.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v4.16b, v17.16b, v16.16b\n"
+ "ldr q3, [x26], #0x10\n"
+ "ldr q2, [x25], #0x10\n"
+ "ldr q1, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v0.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v31.16b, v18.16b, v17.16b\n"
+ "ldr q30, [x21], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v28.16b, v19.16b, v16.16b\n"
+ "ldr q27, [x22], #0x10\n"
+ "zip2 v21.16b, v19.16b, v16.16b\n"
+ "ldr q26, [x21], #0x10\n"
+ "zip1 v16.16b, v11.16b, v30.16b\n"
+ "ldr q25, [x20], #0x10\n"
+ "zip1 v20.16b, v22.16b, v16.16b\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip1 v19.16b, v10.16b, v29.16b\n"
+ "zip2 v18.16b, v22.16b, v16.16b\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v17.16b, v28.16b, v19.16b\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v17.16b, v28.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v20.16b, v11.16b, v30.16b\n"
+ "zip1 v18.16b, v9.16b, v20.16b\n"
+ "zip2 v19.16b, v10.16b, v29.16b\n"
+ "zip1 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.16b, v9.16b, v20.16b\n"
+ "zip2 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v21.16b, v7.16b, v26.16b\n"
+ "zip1 v18.16b, v5.16b, v21.16b\n"
+ "zip1 v20.16b, v8.16b, v27.16b\n"
+ "zip1 v19.16b, v6.16b, v25.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, 
v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v18.16b, v5.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v21.16b, v7.16b, v26.16b\n" + "zip2 v20.16b, v8.16b, v27.16b\n" + "zip1 v18.16b, v4.16b, v21.16b\n" + "zip2 v19.16b, v6.16b, v25.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v18.16b, v4.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip1 v21.16b, v2.16b, v23.16b\n" + "zip1 v18.16b, v0.16b, v21.16b\n" + "zip1 v20.16b, v3.16b, v24.16b\n" + "zip1 v19.16b, v1.16b, v22.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.16b, v0.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v21.16b, v2.16b, v23.16b\n" + "zip1 v18.16b, v31.16b, v21.16b\n" + "zip2 v20.16b, v3.16b, v24.16b\n" + "zip2 v19.16b, v1.16b, v22.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v18.16b, v31.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr d20, [x28], #0x8\n" + "sub x19, x19, #0xc\n" + "ldr d19, [x26], #0x8\n" + "cmp x19, #0xc\n" + "ldr d18, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "ld1 { v20.s }[2], [x28], #0x4\n" + "ld1 { v19.s }[2], [x26], #0x4\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "zip1 v26.16b, v20.16b, v16.16b\n" + "ldr d17, [x22], #0x8\n" + "zip2 v25.16b, v20.16b, v16.16b\n" + "ldr d16, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "ld1 { v17.s }[2], [x22], #0x4\n" + "zip1 v23.16b, v19.16b, v17.16b\n" + "ld1 { v16.s }[2], [x21], #0x4\n" + "zip2 v22.16b, v19.16b, v17.16b\n" + "ld1 { v24.s }[2], [x20], #0x4\n" + "zip1 v21.16b, v18.16b, v16.16b\n" + "zip2 v20.16b, v18.16b, v16.16b\n" + "zip1 v18.16b, v26.16b, v21.16b\n" + "zip1 v19.16b, v27.16b, v24.16b\n" + "zip1 v17.16b, v23.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v18.16b, v26.16b, v21.16b\n" + "zip2 v17.16b, v23.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip1 v18.16b, v25.16b, v20.16b\n" + "zip2 v16.16b, v27.16b, v24.16b\n" + "zip1 v17.16b, v22.16b, v16.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: 
Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s16, [x23], #0x4\n"
+ "zip1 v19.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v19.16b, v18.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b16, [x23], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b18, [x22], #0x1\n"
+ "ldr b16, [x21], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v18.16b, v21.16b, v18.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 8, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 8, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
new file mode 100644
index 0000000000..78301353fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v9.8h, v17.8h, v16.8h\n"
+ "ldr q19, [x28], #0x10\n"
+ "cmp x19, #0x18\n"
+ "zip2 v8.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "zip1 v7.8h, v19.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v6.8h, v19.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "zip1 v5.8h, v17.8h, v16.8h\n"
+ "ldr q22, [x25], #0x10\n"
+ "zip2 v4.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "ldr q19, [x23], #0x10\n"
+ "zip1 v3.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip2 v2.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v1.8h, v20.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v0.8h, v20.8h, v17.8h\n"
+ "ldr q31, [x23], #0x10\n"
+ "zip1 v30.8h, v19.8h, v18.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip2 v29.8h, v19.8h, v18.8h\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "zip1 v28.8h, v22.8h, v16.8h\n"
+ "zip2 v27.8h, v22.8h, v16.8h\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v26.8h, v21.8h, v17.8h\n"
+ "zip2 v25.8h, v21.8h, v17.8h\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v24.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip2 v23.8h, v20.8h, v19.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip1 v21.8h, v31.8h, v16.8h\n"
+ "zip2 v20.8h, v31.8h, v16.8h\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "str q9, [x27, #0x0]\n"
+ "zip2 v18.8h, 
v18.8h, v17.8h\n" + "str q8, [x27, #0x10]\n" + "str q7, [x27, #0x20]\n" + "zip1 v17.8h, v22.8h, v16.8h\n" + "str q3, [x27, #0x30]\n" + "zip2 v16.8h, v22.8h, v16.8h\n" + "str q2, [x27, #0x40]\n" + "str q1, [x27, #0x50]\n" + "str q30, [x27, #0x60]\n" + "str q29, [x27, #0x70]\n" + "str q26, [x27, #0x80]\n" + "str q24, [x27, #0x90]\n" + "str q23, [x27, #0xa0]\n" + "str q19, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "str q6, [x27, #0x0]\n" + "str q5, [x27, #0x10]\n" + "str q4, [x27, #0x20]\n" + "str q0, [x27, #0x30]\n" + "str q28, [x27, #0x40]\n" + "str q27, [x27, #0x50]\n" + "str q25, [x27, #0x60]\n" + "str q21, [x27, #0x70]\n" + "str q20, [x27, #0x80]\n" + "str q18, [x27, #0x90]\n" + "str q17, [x27, #0xa0]\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q16, [x26], #0x10\n" + "zip1 v29.8h, v18.8h, v16.8h\n" + "ldr d17, [x28], #0x8\n" + "cmp x19, #0xc\n" + "zip2 v28.8h, v18.8h, v16.8h\n" + "ldr d16, [x26], #0x8\n" + "ldr q19, [x25], #0x10\n" + "zip1 v27.8h, v17.8h, v16.8h\n" + "ldr d18, [x25], #0x8\n" + "ldr q17, [x24], #0x10\n" + "zip1 v26.8h, v19.8h, v17.8h\n" + "ldr d16, [x24], #0x8\n" + "zip2 v25.8h, v19.8h, v17.8h\n" + "ldr q19, [x23], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip1 v24.8h, v18.8h, v16.8h\n" + "ldr q18, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip1 v23.8h, v19.8h, v17.8h\n" + "zip2 v22.8h, v19.8h, v17.8h\n" + "ldr d21, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "zip1 v20.8h, v18.8h, v16.8h\n" + "ldr d19, [x21], #0x8\n" + "zip2 v18.8h, v18.8h, v16.8h\n" + "ldr d16, [x20], #0x8\n" + "str q29, [x27, #0x0]\n" + "zip1 v17.8h, v21.8h, v17.8h\n" + "str q28, [x27, #0x10]\n" + "zip1 v16.8h, v19.8h, v16.8h\n" + "str q27, [x27, #0x20]\n" + "str q26, [x27, #0x30]\n" + "str q25, [x27, #0x40]\n" + "str q24, [x27, #0x50]\n" + "str q23, [x27, #0x60]\n" + "str q22, [x27, #0x70]\n" + "str q17, [x27, #0x80]\n" + "str q20, [x27, #0x90]\n" + "str q18, [x27, #0xa0]\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d17, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d16, [x26], #0x8\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "ldr d17, [x25], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x24], #0x8\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "ldr d17, [x23], #0x8\n" + "ldr d16, [x22], #0x8\n" + "zip1 v18.8h, v17.8h, v16.8h\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q20, [x27, #0x0]\n" + "str q19, [x27, #0x30]\n" + "str q18, [x27, #0x60]\n" + "str q16, [x27, #0x90]\n" + "add x27, x27, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h17, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h16, [x26], #0x2\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "ldr h17, [x25], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x24], #0x2\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "ldr h17, [x23], #0x2\n" + "ldr h16, [x22], #0x2\n" + "zip1 v18.8h, v17.8h, v16.8h\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str s20, [x27, #0x0]\n" + "str s19, [x27, #0x30]\n" + "str s18, [x27, #0x60]\n" + "str s16, [x27, #0x90]\n" + "add x27, x27, #0x4\n" + "bge 8b\n" + "9:" // Main row 
loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "cmp x19, #0x18\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "str q22, [x27, #0x0]\n"
+ "str q21, [x27, #0x10]\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q18, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d18, [x28], #0x8\n"
+ "cmp x19, #0xc\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x26], #0x8\n"
+ "str q19, [x27, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x27, #0x0]\n"
+ "add x27, x27, #0x4\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
new file mode 100644
index 0000000000..7e8ca6648d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup(height, 4) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q23, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v22.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v21.8h, v18.8h, v16.8h\n"
+ "ldr q12, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v11.8h, v17.8h, v16.8h\n"
+ "ldr q10, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v9.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v8.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q7, [x22], #0x10\n"
+ "zip1 v17.8h, v23.8h, v16.8h\n"
+ "ldr q6, [x24], #0x10\n"
+ "zip2 v16.8h, v23.8h, v16.8h\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v4.8h, v22.8h, v17.8h\n"
+ "ldr q3, [x22], #0x10\n"
+ "zip2 v2.8h, v22.8h, v17.8h\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v1.8h, v21.8h, v16.8h\n"
+ "ldr q0, [x24], #0x10\n"
+ "zip2 v31.8h, v21.8h, v16.8h\n"
+ "ldr q30, [x23], #0x10\n"
+ "zip1 v16.8h, v12.8h, v6.8h\n"
+ "ldr q29, [x22], #0x10\n"
+ "zip1 v28.8h, v20.8h, v16.8h\n"
+ "ldr q27, [x21], #0x10\n"
+ "zip2 v26.8h, v20.8h, v16.8h\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
+ "ldr q25, [x21], #0x10\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v18.8h, v5.8h, v27.8h\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v16.8h, v7.8h, v21.8h\n"
+
"ldr q23, [x20], #0x10\n" + "zip1 v22.8h, v17.8h, v16.8h\n" + "zip2 v20.8h, v17.8h, v16.8h\n" + "str q4, [x27, #0x0]\n" + "zip2 v16.8h, v7.8h, v21.8h\n" + "str q2, [x27, #0x10]\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q1, [x27, #0x20]\n" + "zip2 v21.8h, v19.8h, v16.8h\n" + "str q31, [x27, #0x30]\n" + "zip1 v16.8h, v3.8h, v24.8h\n" + "str q28, [x27, #0x40]\n" + "zip1 v19.8h, v18.8h, v16.8h\n" + "str q26, [x27, #0x50]\n" + "zip2 v18.8h, v18.8h, v16.8h\n" + "str q22, [x27, #0x60]\n" + "zip2 v16.8h, v12.8h, v6.8h\n" + "str q20, [x27, #0x70]\n" + "zip1 v20.8h, v11.8h, v16.8h\n" + "str q17, [x27, #0x80]\n" + "zip2 v17.8h, v11.8h, v16.8h\n" + "str q21, [x27, #0x90]\n" + "zip1 v16.8h, v10.8h, v0.8h\n" + "str q19, [x27, #0xa0]\n" + "zip1 v19.8h, v9.8h, v16.8h\n" + "str q18, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.8h, v9.8h, v16.8h\n" + "str q20, [x27, #0x0]\n" + "zip2 v16.8h, v10.8h, v0.8h\n" + "str q17, [x27, #0x10]\n" + "zip1 v17.8h, v8.8h, v16.8h\n" + "str q19, [x27, #0x20]\n" + "zip2 v16.8h, v8.8h, v16.8h\n" + "str q18, [x27, #0x30]\n" + "zip2 v18.8h, v5.8h, v27.8h\n" + "str q17, [x27, #0x40]\n" + "zip2 v17.8h, v3.8h, v24.8h\n" + "str q16, [x27, #0x50]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x70]\n" + "zip1 v18.8h, v30.8h, v25.8h\n" + "zip1 v17.8h, v29.8h, v23.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x80]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x90]\n" + "zip2 v18.8h, v30.8h, v25.8h\n" + "zip2 v17.8h, v29.8h, v23.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xa0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0xc\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.8h, v18.8h, v16.8h\n" + "ldr d17, [x28], #0x8\n" + "zip2 v23.8h, v18.8h, v16.8h\n" + "ldr d22, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "zip1 v21.8h, v17.8h, v16.8h\n" + "ldr q16, [x24], #0x10\n" + "ldr q31, [x23], #0x10\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "ldr d17, [x24], #0x8\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "ldr d30, [x23], #0x8\n" + "zip1 v29.8h, v19.8h, v18.8h\n" + "ldr q28, [x22], #0x10\n" + "zip2 v20.8h, v19.8h, v18.8h\n" + "ldr q27, [x21], #0x10\n" + "zip1 v19.8h, v23.8h, v16.8h\n" + "ldr q26, [x20], #0x10\n" + "zip2 v18.8h, v23.8h, v16.8h\n" + "ldr d25, [x22], #0x8\n" + "zip1 v16.8h, v22.8h, v17.8h\n" + "zip1 v24.8h, v21.8h, v16.8h\n" + "ldr d23, [x21], #0x8\n" + "zip2 v22.8h, v21.8h, v16.8h\n" + "ldr d21, [x20], #0x8\n" + "zip1 v17.8h, v31.8h, v27.8h\n" + "str q29, [x27, #0x0]\n" + "zip1 v16.8h, v28.8h, v26.8h\n" + "str q20, [x27, #0x10]\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "str q19, [x27, #0x20]\n" + "zip2 v19.8h, v17.8h, v16.8h\n" + "str q18, [x27, #0x30]\n" + "zip2 v18.8h, v31.8h, v27.8h\n" + "str q24, [x27, #0x40]\n" + "zip2 v16.8h, v28.8h, v26.8h\n" + "str q22, [x27, #0x50]\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q20, [x27, #0x60]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q19, [x27, #0x70]\n" + "zip1 v18.8h, v30.8h, v23.8h\n" + "str q17, [x27, #0x80]\n" + "zip1 v17.8h, v25.8h, v21.8h\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xa0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" 
+ "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d17, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d18, [x26], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x25], #0x8\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "ldr d16, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "zip1 v18.8h, v17.8h, v16.8h\n" + "zip2 v17.8h, v17.8h, v16.8h\n" + "ldr d16, [x20], #0x8\n" + "str q18, [x27, #0x0]\n" + "zip1 v18.8h, v21.8h, v19.8h\n" + "str q17, [x27, #0x10]\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, #0x20\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h18, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h17, [x26], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x25], #0x2\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr h16, [x24], #0x2\n" + "ldr h20, [x23], #0x2\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "ldr h19, [x22], #0x2\n" + "ldr h17, [x21], #0x2\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr h16, [x20], #0x2\n" + "zip1 v17.8h, v20.8h, v17.8h\n" + "str d18, [x27, #0x0]\n" + "zip1 v16.8h, v19.8h, v16.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str d16, [x27, #0x60]\n" + "add x27, x27, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x18\n" + "ldr q16, [x25], #0x10\n" + "zip1 v28.8h, v18.8h, v16.8h\n" + "ldr q17, [x28], #0x10\n" + "zip2 v27.8h, v18.8h, v16.8h\n" + "ldr q26, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v25.8h, v17.8h, v16.8h\n" + "ldr q18, [x28], #0x10\n" + "zip2 v24.8h, v17.8h, v16.8h\n" + "ldr q23, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v22.8h, v18.8h, v16.8h\n" + "ldr q17, [x24], #0x10\n" + "zip2 v21.8h, v18.8h, v16.8h\n" + "ldr q20, [x24], #0x10\n" + "zip1 v16.8h, v19.8h, v17.8h\n" + "zip2 v18.8h, v19.8h, v17.8h\n" + "ldr q19, [x24], #0x10\n" + "zip1 v17.8h, v28.8h, v16.8h\n" + "zip2 v16.8h, v28.8h, v16.8h\n" + "str q17, [x27, #0x0]\n" + "zip1 v17.8h, v27.8h, v18.8h\n" + "str q16, [x27, #0x10]\n" + "zip2 v16.8h, v27.8h, v18.8h\n" + "str q17, [x27, #0x20]\n" + "zip1 v17.8h, v26.8h, v20.8h\n" + "str q16, [x27, #0x30]\n" + "zip1 v16.8h, v25.8h, v17.8h\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.8h, v25.8h, v17.8h\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip1 v16.8h, v24.8h, v18.8h\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.8h, v24.8h, v18.8h\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.8h, v22.8h, v17.8h\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.8h, v22.8h, 
v17.8h\n" + "str q16, [x27, #0x30]\n" + "zip2 v17.8h, v23.8h, v19.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.8h, v21.8h, v17.8h\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q24, [x26], #0x10\n" + "cmp x19, #0xc\n" + "ldr q16, [x25], #0x10\n" + "zip1 v23.8h, v18.8h, v16.8h\n" + "ldr d17, [x28], #0x8\n" + "zip2 v22.8h, v18.8h, v16.8h\n" + "ldr d21, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "ldr q16, [x24], #0x10\n" + "zip1 v19.8h, v24.8h, v16.8h\n" + "ldr d18, [x24], #0x8\n" + "zip2 v17.8h, v24.8h, v16.8h\n" + "zip1 v16.8h, v23.8h, v19.8h\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.8h, v22.8h, v17.8h\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.8h, v22.8h, v17.8h\n" + "str q16, [x27, #0x30]\n" + "zip1 v17.8h, v21.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d18, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d17, [x26], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x25], #0x8\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr d16, [x24], #0x8\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h17, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h18, [x26], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x25], #0x2\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "ldr h16, [x24], #0x2\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x60\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<12, 4, true, VLType::None>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_12_2x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..efb1c742ed --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp @@ -0,0 +1,735 @@ +/* + * 
Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 12 * roundup(height, 4) * sizeof(bfloat16); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q12, [x28], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x18\n" + "ldr q11, [x25], #0x10\n" + "zip1 v29.4s, v12.4s, v11.4s\n" + "ldr q5, [x28], #0x10\n" + "zip2 v0.4s, v12.4s, v11.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v23.4s, v5.4s, v17.4s\n" + "ldr q25, [x28], #0x10\n" + "zip2 v18.4s, v5.4s, v17.4s\n" + "ldr q6, [x26], #0x10\n" + "ldr q31, [x25], #0x10\n" + "zip1 v21.4s, v25.4s, v31.4s\n" + "ldr q16, [x28], #0x10\n" + "zip2 v10.4s, v25.4s, v31.4s\n" + "ldr q11, [x26], #0x10\n" + "ldr q1, [x25], #0x10\n" + "zip1 v13.4s, v16.4s, v1.4s\n" + "ldr q14, [x28], #0x10\n" + "zip2 v24.4s, v16.4s, v1.4s\n" + "ldr q4, [x26], #0x10\n" + "ldr q22, [x25], #0x10\n" + "zip1 v1.4s, v14.4s, v22.4s\n" + "ldr q15, [x28], #0x10\n" + "zip2 v8.4s, v14.4s, v22.4s\n" + "ldr q31, [x26], #0x10\n" + "ldr q3, [x25], #0x10\n" + "zip1 v27.4s, v15.4s, v3.4s\n" + "ldr q30, [x24], #0x10\n" + "zip2 v22.4s, v15.4s, v3.4s\n" + "ldr q15, [x23], #0x10\n" + "ldr q5, [x22], #0x10\n" + "zip1 v16.4s, v20.4s, v30.4s\n" + "ldr q3, [x24], #0x10\n" + "zip2 v7.4s, v20.4s, v30.4s\n" + "ldr q26, [x23], #0x10\n" + "zip1 v12.4s, v29.4s, v16.4s\n" + "ldr q25, [x22], #0x10\n" + ".inst 0x0ea16994 // bfcvtn v20.4h, v12.4s\n" + "ldr q2, [x21], #0x10\n" + "zip2 v16.4s, v29.4s, v16.4s\n" + "ldr q19, [x24], #0x10\n" + "zip1 v12.4s, v0.4s, v7.4s\n" + "ldr q9, [x23], #0x10\n" + ".inst 0x4ea16a14 // 
bfcvtn2 v20.8h, v16.4s\n" + "ldr q14, [x22], #0x10\n" + ".inst 0x0ea1699e // bfcvtn v30.4h, v12.4s\n" + "ldr q12, [x21], #0x10\n" + "zip2 v16.4s, v0.4s, v7.4s\n" + "ldr q7, [x24], #0x10\n" + "zip1 v29.4s, v28.4s, v3.4s\n" + "ldr q0, [x23], #0x10\n" + ".inst 0x4ea16a1e // bfcvtn2 v30.8h, v16.4s\n" + "ldr q17, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v29.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v23.4s, v23.4s, v29.4s\n" + "ldr q29, [x24], #0x10\n" + "zip2 v28.4s, v28.4s, v3.4s\n" + "ldr q3, [x23], #0x10\n" + ".inst 0x4ea16af0 // bfcvtn2 v16.8h, v23.4s\n" + "zip1 v23.4s, v18.4s, v28.4s\n" + ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + "zip2 v28.4s, v18.4s, v28.4s\n" + "ldr q18, [x24], #0x10\n" + ".inst 0x4ea16b97 // bfcvtn2 v23.8h, v28.4s\n" + "zip1 v28.4s, v6.4s, v19.4s\n" + "zip2 v6.4s, v6.4s, v19.4s\n" + "zip1 v19.4s, v21.4s, v28.4s\n" + ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" + "zip2 v28.4s, v21.4s, v28.4s\n" + "ldr q21, [x23], #0x10\n" + ".inst 0x4ea16b93 // bfcvtn2 v19.8h, v28.4s\n" + "zip1 v28.4s, v10.4s, v6.4s\n" + ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n" + "zip2 v6.4s, v10.4s, v6.4s\n" + "ldr q10, [x22], #0x10\n" + ".inst 0x4ea168dc // bfcvtn2 v28.8h, v6.4s\n" + "zip1 v6.4s, v11.4s, v7.4s\n" + "zip2 v7.4s, v11.4s, v7.4s\n" + "zip1 v11.4s, v13.4s, v6.4s\n" + ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n" + "zip2 v13.4s, v13.4s, v6.4s\n" + "ldr q6, [x22], #0x10\n" + ".inst 0x4ea169ab // bfcvtn2 v11.8h, v13.4s\n" + "zip1 v13.4s, v24.4s, v7.4s\n" + ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n" + "zip2 v7.4s, v24.4s, v7.4s\n" + "ldr q24, [x21], #0x10\n" + ".inst 0x4ea168ed // bfcvtn2 v13.8h, v7.4s\n" + "zip1 v7.4s, v4.4s, v29.4s\n" + "zip2 v29.4s, v4.4s, v29.4s\n" + "zip1 v4.4s, v1.4s, v7.4s\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + "zip2 v7.4s, v1.4s, v7.4s\n" + "ldr q1, [x21], #0x10\n" + ".inst 0x4ea168e4 // bfcvtn2 v4.8h, v7.4s\n" + "zip1 v7.4s, v8.4s, v29.4s\n" + ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n" + "zip2 v8.4s, v8.4s, v29.4s\n" + "ldr q29, [x21], #0x10\n" + ".inst 0x4ea16907 // bfcvtn2 v7.8h, v8.4s\n" + "zip1 v8.4s, v31.4s, v18.4s\n" + "zip2 v31.4s, v31.4s, v18.4s\n" + "zip1 v18.4s, v27.4s, v8.4s\n" + ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" + "zip2 v27.4s, v27.4s, v8.4s\n" + "ldr q8, [x21], #0x10\n" + ".inst 0x4ea16b72 // bfcvtn2 v18.8h, v27.4s\n" + "zip1 v27.4s, v22.4s, v31.4s\n" + ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n" + "zip2 v31.4s, v22.4s, v31.4s\n" + "ldr q22, [x20], #0x10\n" + ".inst 0x4ea16bfb // bfcvtn2 v27.8h, v31.4s\n" + "zip1 v31.4s, v15.4s, v2.4s\n" + "zip2 v2.4s, v15.4s, v2.4s\n" + "zip1 v15.4s, v26.4s, v12.4s\n" + "zip2 v26.4s, v26.4s, v12.4s\n" + "zip1 v12.4s, v5.4s, v22.4s\n" + "zip2 v22.4s, v5.4s, v22.4s\n" + "zip1 v5.4s, v31.4s, v12.4s\n" + ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n" + "zip2 v31.4s, v31.4s, v12.4s\n" + "ldr q12, [x20], #0x10\n" + ".inst 0x4ea16be5 // bfcvtn2 v5.8h, v31.4s\n" + "zip1 v31.4s, v2.4s, v22.4s\n" + ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n" + "zip2 v2.4s, v2.4s, v22.4s\n" + "ldr q22, [x20], #0x10\n" + ".inst 0x4ea1685f // bfcvtn2 v31.8h, v2.4s\n" + "zip1 v2.4s, v25.4s, v12.4s\n" + "zip2 v25.4s, v25.4s, v12.4s\n" + "zip1 v12.4s, v15.4s, v2.4s\n" + ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n" + "zip2 v15.4s, v15.4s, v2.4s\n" + "ldr q2, [x20], #0x10\n" + ".inst 0x4ea169ec // bfcvtn2 v12.8h, v15.4s\n" + "zip1 v15.4s, v26.4s, v25.4s\n" + ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n" + "zip2 v25.4s, v26.4s, v25.4s\n" + "ldr q26, [x20], #0x10\n" + ".inst 0x4ea16b2f // bfcvtn2 
v15.8h, v25.4s\n" + "ldr q25, [x20], #0x10\n" + "str q20, [x27, #0x0]\n" + "zip1 v20.4s, v9.4s, v24.4s\n" + "zip2 v24.4s, v9.4s, v24.4s\n" + "str q30, [x27, #0x10]\n" + "zip1 v9.4s, v14.4s, v22.4s\n" + "str q16, [x27, #0x20]\n" + "zip1 v16.4s, v20.4s, v9.4s\n" + "str q23, [x27, #0x30]\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str q19, [x27, #0x40]\n" + "zip2 v9.4s, v20.4s, v9.4s\n" + "str q28, [x27, #0x50]\n" + "zip2 v22.4s, v14.4s, v22.4s\n" + "str q5, [x27, #0x60]\n" + ".inst 0x4ea16930 // bfcvtn2 v16.8h, v9.4s\n" + "str q31, [x27, #0x70]\n" + "zip1 v19.4s, v24.4s, v22.4s\n" + "str q12, [x27, #0x80]\n" + ".inst 0x0ea16a6c // bfcvtn v12.4h, v19.4s\n" + "str q15, [x27, #0x90]\n" + "zip2 v9.4s, v24.4s, v22.4s\n" + "str q16, [x27, #0xa0]\n" + "zip1 v15.4s, v0.4s, v1.4s\n" + ".inst 0x4ea1692c // bfcvtn2 v12.8h, v9.4s\n" + "str q12, [x27, #0xb0]\n" + "zip1 v20.4s, v17.4s, v2.4s\n" + "add x27, x27, %x[out_stride]\n" + "zip1 v16.4s, v15.4s, v20.4s\n" + "str q11, [x27, #0x0]\n" + "zip2 v9.4s, v15.4s, v20.4s\n" + "str q13, [x27, #0x10]\n" + ".inst 0x0ea16a0f // bfcvtn v15.4h, v16.4s\n" + "str q4, [x27, #0x20]\n" + "zip2 v14.4s, v0.4s, v1.4s\n" + "str q7, [x27, #0x30]\n" + "zip2 v31.4s, v17.4s, v2.4s\n" + "str q18, [x27, #0x40]\n" + ".inst 0x4ea1692f // bfcvtn2 v15.8h, v9.4s\n" + "str q27, [x27, #0x50]\n" + "zip1 v22.4s, v14.4s, v31.4s\n" + "str q15, [x27, #0x60]\n" + ".inst 0x0ea16ac9 // bfcvtn v9.4h, v22.4s\n" + "zip2 v11.4s, v14.4s, v31.4s\n" + "zip1 v18.4s, v3.4s, v29.4s\n" + "zip1 v27.4s, v10.4s, v26.4s\n" + ".inst 0x4ea16969 // bfcvtn2 v9.8h, v11.4s\n" + "str q9, [x27, #0x70]\n" + "zip1 v13.4s, v18.4s, v27.4s\n" + "zip2 v9.4s, v18.4s, v27.4s\n" + ".inst 0x0ea169b3 // bfcvtn v19.4h, v13.4s\n" + "zip2 v18.4s, v3.4s, v29.4s\n" + "zip2 v1.4s, v10.4s, v26.4s\n" + ".inst 0x4ea16933 // bfcvtn2 v19.8h, v9.4s\n" + "str q19, [x27, #0x80]\n" + "zip1 v16.4s, v18.4s, v1.4s\n" + "zip2 v20.4s, v18.4s, v1.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v21.4s, v8.4s\n" + "zip1 v2.4s, v6.4s, v25.4s\n" + ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.4s, v18.4s, v2.4s\n" + "zip2 v20.4s, v18.4s, v2.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v21.4s, v8.4s\n" + "zip2 v17.4s, v6.4s, v25.4s\n" + ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n" + "str q16, [x27, #0xa0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q21, [x26], #0x10\n" + "cmp x19, #0xc\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v20.4s, v18.4s, v16.4s\n" + "ldr q8, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v7.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v6.4s, v17.4s, v16.4s\n" + "ldr q5, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v4.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v3.4s, v18.4s, v17.4s\n" + "ldr q2, [x23], #0x10\n" + "ldr q1, [x22], #0x10\n" + "zip1 v17.4s, v21.4s, v16.4s\n" + "ldr q0, [x24], #0x10\n" + "zip2 v18.4s, v21.4s, v16.4s\n" + "ldr q31, [x23], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + "ldr q30, [x22], #0x10\n" + ".inst 0x0ea16a1d // bfcvtn v29.4h, v16.4s\n" 
+ "ldr q28, [x21], #0x10\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "ldr q27, [x24], #0x10\n" + "zip1 v16.4s, v20.4s, v18.4s\n" + "ldr q26, [x23], #0x10\n" + ".inst 0x4ea16a3d // bfcvtn2 v29.8h, v17.4s\n" + "ldr q25, [x22], #0x10\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "ldr q24, [x21], #0x10\n" + "zip2 v16.4s, v20.4s, v18.4s\n" + "ldr q23, [x20], #0x10\n" + "zip1 v17.4s, v8.4s, v0.4s\n" + "ldr q22, [x21], #0x10\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "zip1 v16.4s, v7.4s, v17.4s\n" + "ldr q21, [x20], #0x10\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "ldr q20, [x20], #0x10\n" + "zip2 v16.4s, v7.4s, v17.4s\n" + "zip2 v17.4s, v8.4s, v0.4s\n" + "str q29, [x27, #0x0]\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q19, [x27, #0x10]\n" + "zip1 v16.4s, v6.4s, v17.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v6.4s, v17.4s\n" + "zip1 v17.4s, v5.4s, v27.4s\n" + "zip1 v16.4s, v4.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v4.4s, v17.4s\n" + "zip2 v17.4s, v5.4s, v27.4s\n" + "zip1 v16.4s, v3.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v3.4s, v17.4s\n" + "zip1 v18.4s, v2.4s, v28.4s\n" + "zip1 v17.4s, v1.4s, v23.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x50]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v2.4s, v28.4s\n" + "zip2 v17.4s, v1.4s, v23.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x60]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v31.4s, v24.4s\n" + "zip1 v17.4s, v30.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x70]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v31.4s, v24.4s\n" + "zip2 v17.4s, v30.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x80]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v26.4s, v22.4s\n" + "zip1 v17.4s, v25.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v26.4s, v22.4s\n" + "zip2 v17.4s, v25.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xa0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr q20, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v19.4s, v20.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v25.4s, v20.4s, v17.4s\n" + "ldr q24, [x23], #0x10\n" + "ldr q23, [x22], #0x10\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "ldr q22, [x21], #0x10\n" + "zip2 v21.4s, v18.4s, v16.4s\n" 
+ "ldr q20, [x20], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v25.4s, v21.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v21.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x10]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x60]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, #0x20\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr s18, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s17, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v18.4s, v18.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + "ldr s19, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "ldr s18, [x20], #0x4\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v17.4s, v20.4s, v17.4s\n" + "str d16, [x27, #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x60]\n" + "add x27, x27, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x18\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v9.4s, v17.4s, v16.4s\n" + "ldr q8, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v7.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v6.4s, v18.4s, v16.4s\n" + "ldr q5, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v4.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v3.4s, v17.4s, v16.4s\n" + "ldr q2, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v1.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v0.4s, v18.4s, v16.4s\n" + "ldr q31, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v30.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v29.4s, v17.4s, v16.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v27.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v26.4s, v18.4s, v17.4s\n" + "ldr q25, [x24], #0x10\n" + "zip1 v17.4s, v20.4s, v16.4s\n" + "zip2 v24.4s, v20.4s, v16.4s\n" + "ldr q23, [x24], #0x10\n" + "zip1 
v16.4s, v19.4s, v17.4s\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "ldr q22, [x24], #0x10\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip1 v16.4s, v9.4s, v24.4s\n" + "ldr q21, [x24], #0x10\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "ldr q20, [x24], #0x10\n" + "zip2 v16.4s, v9.4s, v24.4s\n" + "zip1 v17.4s, v8.4s, v25.4s\n" + "str q19, [x27, #0x0]\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q18, [x27, #0x10]\n" + "zip1 v16.4s, v7.4s, v17.4s\n" + "zip2 v19.4s, v7.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v8.4s, v25.4s\n" + "zip1 v16.4s, v6.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v6.4s, v17.4s\n" + "zip1 v17.4s, v5.4s, v23.4s\n" + "zip1 v16.4s, v4.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v4.4s, v17.4s\n" + "zip2 v17.4s, v5.4s, v23.4s\n" + "zip1 v16.4s, v3.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v3.4s, v17.4s\n" + "zip1 v17.4s, v2.4s, v22.4s\n" + "zip1 v16.4s, v1.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v18.4s, v1.4s, v17.4s\n" + "zip2 v17.4s, v2.4s, v22.4s\n" + "zip1 v16.4s, v0.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x0]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v0.4s, v17.4s\n" + "zip1 v17.4s, v31.4s, v21.4s\n" + "zip1 v16.4s, v30.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x10]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v30.4s, v17.4s\n" + "zip2 v17.4s, v31.4s, v21.4s\n" + "zip1 v16.4s, v29.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v29.4s, v17.4s\n" + "zip1 v17.4s, v28.4s, v20.4s\n" + "zip1 v16.4s, v27.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v17.4s, v27.4s, v17.4s\n" + "zip2 v18.4s, v28.4s, v20.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0xc\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v29.4s, v18.4s, v16.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v27.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v26.4s, v17.4s, v16.4s\n" + "ldr q25, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v24.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v23.4s, v18.4s, v17.4s\n" + "ldr q22, [x24], #0x10\n" + "zip1 v17.4s, v20.4s, v16.4s\n" + "zip2 v21.4s, v20.4s, v16.4s\n" + "ldr q20, 
[x24], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + "zip2 v19.4s, v19.4s, v17.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip1 v16.4s, v29.4s, v21.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n" + "str q17, [x27, #0x0]\n" + "zip2 v16.4s, v29.4s, v21.4s\n" + "zip1 v17.4s, v28.4s, v22.4s\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q18, [x27, #0x10]\n" + "zip1 v16.4s, v27.4s, v17.4s\n" + "zip2 v19.4s, v27.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v28.4s, v22.4s\n" + "zip1 v16.4s, v26.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v26.4s, v17.4s\n" + "zip1 v17.4s, v25.4s, v20.4s\n" + "zip1 v16.4s, v24.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v17.4s, v24.4s, v17.4s\n" + "zip2 v18.4s, v25.4s, v20.4s\n" + "zip1 v16.4s, v23.4s, v18.4s\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v23.4s, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x50]\n" + "add x27, x27, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr q19, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v21.4s, v19.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v20.4s, v19.4s, v17.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v19.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v21.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr s17, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v17.4s, v17.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x60\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace +template<> +void Transform<12, 4, true, VLType::None>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_12_2x4_fp32bf16( + out, + in + k0 * stride + 
x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp new file mode 100644 index 0000000000..7359eea737 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 12 * height * sizeof(int16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q16, [x24], #0x10\n" + "sshll v27.8h, v16.8b, #0x0\n" + "ldr d17, [x24], #0x8\n" + "sub x19, x19, #0x18\n" + "sshll2 v16.8h, v16.16b, #0x0\n" + "ldr q26, [x22], #0x10\n" + "cmp x19, #0x18\n" + "dup v20.2d, v16.d[0]\n" + "ldr q25, [x21], #0x10\n" + "dup v24.2d, v16.d[1]\n" + "ldr q23, [x20], #0x10\n" + "sshll v16.8h, v17.8b, #0x0\n" + "ldr d19, [x22], #0x8\n" + "mov v24.d[1], v16.d[0]\n" + "dup v22.2d, v16.d[1]\n" + "ldr d18, [x21], #0x8\n" + "sshll v16.8h, v26.8b, #0x0\n" + "ldr d21, [x20], #0x8\n" + "mov v20.d[1], v16.d[0]\n" + "str q27, [x23, #0x0]\n" + "dup v17.2d, v16.d[1]\n" + "str q20, [x23, #0x10]\n" + "sshll2 v16.8h, v26.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x20]\n" + "mov v22.d[1], v16.d[1]\n" + "sshll v20.8h, v19.8b, #0x0\n" + "sshll v16.8h, v25.8b, #0x0\n" + "str q16, [x23, #0x30]\n" + "sshll2 v16.8h, v25.16b, #0x0\n" + "dup v17.2d, v16.d[0]\n" + "dup v19.2d, v16.d[1]\n" + "sshll v16.8h, v18.8b, #0x0\n" + "mov v19.d[1], v16.d[0]\n" + "dup v18.2d, v16.d[1]\n" + "sshll v16.8h, v23.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x40]\n" + "dup v17.2d, v16.d[1]\n" + "sshll2 v16.8h, v23.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "mov v18.d[1], v16.d[1]\n" + "str q24, [x23, 
#0x0]\n" + "sshll v16.8h, v21.8b, #0x0\n" + "str q22, [x23, #0x10]\n" + "str q20, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q16, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "ldr d21, [x22], #0x8\n" + "cmp x19, #0xc\n" + "ldr d20, [x21], #0x8\n" + "ldr d19, [x20], #0x8\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "sshll v17.8h, v16.8b, #0x0\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "sshll2 v18.8h, v16.16b, #0x0\n" + "ld1 { v20.s }[2], [x21], #0x4\n" + "ld1 { v19.s }[2], [x20], #0x4\n" + "sshll v16.8h, v21.8b, #0x0\n" + "str q17, [x23, #0x0]\n" + "sshll2 v17.8h, v21.16b, #0x0\n" + "mov v18.d[1], v16.d[0]\n" + "str q18, [x23, #0x10]\n" + "dup v16.2d, v16.d[1]\n" + "mov v16.d[1], v17.d[0]\n" + "str q16, [x23, #0x20]\n" + "sshll v16.8h, v20.8b, #0x0\n" + "str q16, [x23, #0x30]\n" + "sshll2 v17.8h, v20.16b, #0x0\n" + "sshll v16.8h, v19.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x40]\n" + "dup v17.2d, v16.d[1]\n" + "sshll2 v16.8h, v19.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr s16, [x24], #0x4\n" + "sshll v19.8h, v16.8b, #0x0\n" + "ldr s16, [x22], #0x4\n" + "sub x19, x19, #0x4\n" + "sshll v18.8h, v16.8b, #0x0\n" + "ldr s16, [x21], #0x4\n" + "cmp x19, #0x4\n" + "sshll v17.8h, v16.8b, #0x0\n" + "ldr s16, [x20], #0x4\n" + "str d19, [x23, #0x0]\n" + "sshll v16.8h, v16.8b, #0x0\n" + "str d18, [x23, #0x18]\n" + "str d17, [x23, #0x30]\n" + "str d16, [x23, #0x48]\n" + "add x23, x23, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr b16, [x24], #0x1\n" + "sshll v19.8h, v16.8b, #0x0\n" + "ldr b16, [x22], #0x1\n" + "sub x19, x19, #0x1\n" + "sshll v18.8h, v16.8b, #0x0\n" + "ldr b16, [x21], #0x1\n" + "cmp x19, #0x1\n" + "sshll v17.8h, v16.8b, #0x0\n" + "ldr b16, [x20], #0x1\n" + "str h19, [x23, #0x0]\n" + "sshll v16.8h, v16.8b, #0x0\n" + "str h18, [x23, #0x18]\n" + "str h17, [x23, #0x30]\n" + "str h16, [x23, #0x48]\n" + "add x23, x23, #0x2\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x60\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q17, [x24], #0x10\n" + "sshll v16.8h, v17.8b, #0x0\n" + "ldr d18, [x24], #0x8\n" + "sub x19, x19, #0x18\n" + "sshll2 v17.8h, v17.16b, #0x0\n" + "str q16, [x23, #0x0]\n" + "cmp x19, #0x18\n" + "dup v16.2d, v17.d[0]\n" + "str d16, [x23, #0x10]\n" + "dup v17.2d, v17.d[1]\n" + "add x23, x23, %x[out_stride]\n" + "sshll v16.8h, v18.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x0]\n" + "dup v16.2d, v16.d[1]\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr d17, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "cmp x19, #0xc\n" + "ld1 { 
v17.s }[2], [x24], #0x4\n" + "sshll v16.8h, v17.8b, #0x0\n" + "str q16, [x23, #0x0]\n" + "sshll2 v16.8h, v17.16b, #0x0\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr s16, [x24], #0x4\n" + "sshll v16.8h, v16.8b, #0x0\n" + "str d16, [x23, #0x0]\n" + "sub x19, x19, #0x4\n" + "add x23, x23, #0x8\n" + "cmp x19, #0x4\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr b16, [x24], #0x1\n" + "sshll v16.8h, v16.8b, #0x0\n" + "str h16, [x23, #0x0]\n" + "sub x19, x19, #0x1\n" + "add x23, x23, #0x2\n" + "cmp x19, #0x1\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x18\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace +template<> +void Transform<12, 1, true, VLType::None>( + int16_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_12_s8s16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp new file mode 100644 index 0000000000..34fb0ed5ac --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 12 * height * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q16, [x24], #0x10\n" + "ushll v27.8h, v16.8b, #0x0\n" + "ldr d17, [x24], #0x8\n" + "sub x19, x19, #0x18\n" + "ushll2 v16.8h, v16.16b, #0x0\n" + "ldr q26, [x22], #0x10\n" + "cmp x19, #0x18\n" + "dup v20.2d, v16.d[0]\n" + "ldr q25, [x21], #0x10\n" + "dup v24.2d, v16.d[1]\n" + "ldr q23, [x20], #0x10\n" + "ushll v16.8h, v17.8b, #0x0\n" + "ldr d19, [x22], #0x8\n" + "mov v24.d[1], v16.d[0]\n" + "dup v22.2d, v16.d[1]\n" + "ldr d18, [x21], #0x8\n" + "ushll v16.8h, v26.8b, #0x0\n" + "ldr d21, [x20], #0x8\n" + "mov v20.d[1], v16.d[0]\n" + "str q27, [x23, #0x0]\n" + "dup v17.2d, v16.d[1]\n" + "str q20, [x23, #0x10]\n" + "ushll2 v16.8h, v26.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x20]\n" + "mov v22.d[1], v16.d[1]\n" + "ushll v20.8h, v19.8b, #0x0\n" + "ushll v16.8h, v25.8b, #0x0\n" + "str q16, [x23, #0x30]\n" + "ushll2 v16.8h, v25.16b, #0x0\n" + "dup v17.2d, v16.d[0]\n" + "dup v19.2d, v16.d[1]\n" + "ushll v16.8h, v18.8b, #0x0\n" + "mov v19.d[1], v16.d[0]\n" + "dup v18.2d, v16.d[1]\n" + "ushll v16.8h, v23.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x40]\n" + "dup v17.2d, v16.d[1]\n" + "ushll2 v16.8h, v23.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "mov v18.d[1], v16.d[1]\n" + "str q24, [x23, #0x0]\n" + "ushll v16.8h, v21.8b, #0x0\n" + "str q22, [x23, #0x10]\n" + "str q20, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q16, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "ldr d21, [x22], #0x8\n" + "cmp x19, #0xc\n" + "ldr d20, [x21], #0x8\n" + "ldr d19, [x20], #0x8\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ushll v17.8h, v16.8b, #0x0\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ushll2 v18.8h, v16.16b, #0x0\n" + "ld1 { v20.s }[2], [x21], #0x4\n" + "ld1 { v19.s }[2], [x20], #0x4\n" + "ushll v16.8h, v21.8b, #0x0\n" + "str q17, [x23, #0x0]\n" + "ushll2 v17.8h, v21.16b, #0x0\n" + "mov v18.d[1], v16.d[0]\n" + "str q18, [x23, #0x10]\n" + "dup v16.2d, v16.d[1]\n" + "mov v16.d[1], v17.d[0]\n" + "str q16, [x23, #0x20]\n" + "ushll v16.8h, v20.8b, #0x0\n" + "str q16, [x23, #0x30]\n" + "ushll2 v17.8h, v20.16b, #0x0\n" + "ushll v16.8h, v19.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x40]\n" + "dup v17.2d, v16.d[1]\n" + "ushll2 v16.8h, v19.16b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr s16, [x24], #0x4\n" + "ushll v19.8h, v16.8b, #0x0\n" + "ldr s16, [x22], #0x4\n" + "sub x19, x19, #0x4\n" + "ushll v18.8h, v16.8b, #0x0\n" + "ldr s16, [x21], 
#0x4\n" + "cmp x19, #0x4\n" + "ushll v17.8h, v16.8b, #0x0\n" + "ldr s16, [x20], #0x4\n" + "str d19, [x23, #0x0]\n" + "ushll v16.8h, v16.8b, #0x0\n" + "str d18, [x23, #0x18]\n" + "str d17, [x23, #0x30]\n" + "str d16, [x23, #0x48]\n" + "add x23, x23, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr b16, [x24], #0x1\n" + "ushll v19.8h, v16.8b, #0x0\n" + "ldr b16, [x22], #0x1\n" + "sub x19, x19, #0x1\n" + "ushll v18.8h, v16.8b, #0x0\n" + "ldr b16, [x21], #0x1\n" + "cmp x19, #0x1\n" + "ushll v17.8h, v16.8b, #0x0\n" + "ldr b16, [x20], #0x1\n" + "str h19, [x23, #0x0]\n" + "ushll v16.8h, v16.8b, #0x0\n" + "str h18, [x23, #0x18]\n" + "str h17, [x23, #0x30]\n" + "str h16, [x23, #0x48]\n" + "add x23, x23, #0x2\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x60\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q17, [x24], #0x10\n" + "ushll v16.8h, v17.8b, #0x0\n" + "ldr d18, [x24], #0x8\n" + "sub x19, x19, #0x18\n" + "ushll2 v17.8h, v17.16b, #0x0\n" + "str q16, [x23, #0x0]\n" + "cmp x19, #0x18\n" + "dup v16.2d, v17.d[0]\n" + "str d16, [x23, #0x10]\n" + "dup v17.2d, v17.d[1]\n" + "add x23, x23, %x[out_stride]\n" + "ushll v16.8h, v18.8b, #0x0\n" + "mov v17.d[1], v16.d[0]\n" + "str q17, [x23, #0x0]\n" + "dup v16.2d, v16.d[1]\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr d17, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "cmp x19, #0xc\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ushll v16.8h, v17.8b, #0x0\n" + "str q16, [x23, #0x0]\n" + "ushll2 v16.8h, v17.16b, #0x0\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr s16, [x24], #0x4\n" + "ushll v16.8h, v16.8b, #0x0\n" + "str d16, [x23, #0x0]\n" + "sub x19, x19, #0x4\n" + "add x23, x23, #0x8\n" + "cmp x19, #0x4\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr b16, [x24], #0x1\n" + "ushll v16.8h, v16.8b, #0x0\n" + "str h16, [x23, #0x0]\n" + "sub x19, x19, #0x1\n" + "add x23, x23, #0x2\n" + "cmp x19, #0x1\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x18\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace +template<> +void Transform<12, 1, true, VLType::None>( + uint16_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_12_u8u16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +#endif diff --git 
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp deleted file mode 100644 index f6233ef503..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 6x32-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t *>(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 12x16-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t *>(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 12 x uint16_t version -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "STR q0, [%[out]]\n" - "LDR d1, [%[in0], #0x10]\n" - "STR d1, [%[out], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r" (in0), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "LDR d1, [%[in0], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - - "LDR x21, [%[in1]]\n" - "LDR q2, [%[in1], #0x08]\n" - "INS v1.d[1], x21\n" - "ADD %x[in1], %x[in1], #0x18\n" - "STP q0, q1, [%[out]]\n" - "STR q2, [%x[out], #0x20]\n" - ASM_PREFETCH("[%[in1], #192]") - : [in0] "+r" (in0), - [in1] "+r" (in1), - [out] "+r" (out) - : - : "x21", "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void
TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "LDR q0, [%x[in0]], #0x10\n" - "STR q0, [%x[out]]\n" - "LDR d1, [%x[in0]], #0x08\n" - ASM_PREFETCH("[%[in0], #192]") - "STR d1, [%x[out], #0x10]\n" - - "LDR q0, [%x[in1]], #0x10\n" - "STR q0, [%x[out], #0x18]\n" - "LDR d1, [%x[in1]], #0x08\n" - ASM_PREFETCH("[%[in1], #192]") - "STR d1, [%x[out], #0x28]\n" - - "LDR q0, [%x[in2]], #0x10\n" - "STR q0, [%x[out], #0x30]\n" - "LDR d1, [%x[in2]], #0x08\n" - ASM_PREFETCH("[%[in2], #192]") - "STR d1, [%x[out], #0x40]\n" - - "LDR q0, [%x[in3]], #0x10\n" - "STR q0, [%x[out], #0x48]\n" - "LDR d1, [%x[in3]], #0x08\n" - ASM_PREFETCH("[%[in3], #192]") - "STR d1, [%x[out], #0x58]\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [in2] "+r" (in2), - [in3] "+r" (in3), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp deleted file mode 100644 index c0f3e17d31..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include "transpose_interleave_common.hpp" - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR d2, [%[in0]], #8\n" - "FCVTL v2.4s, v2.4h\n" - "STR q2, [%[out], #32]\n" - : [in0] "+r" (in0), [out] "+r" (out) - : - : "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR d2, [%[in0]], #8\n" - "FCVTL v2.4s, v2.4h\n" - "LDR q3, [%[in1]], #16\n" - "FCVTL2 v4.4s, v3.8h\n" - "FCVTL v3.4s, v3.4h\n" - "STP q2, q3, [%[out], #32]\n" - ASM_PREFETCH("[%[in1], #192]") - "LDR d5, [%[in1]], #8\n" - "FCVTL v5.4s, v5.4h\n" - "STP q4, q5, [%[out], #64]\n" - : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - "LDR d2, [%[in0]], #8\n" - ASM_PREFETCH("[%[in0], #192]") - "FCVTL v2.4s, v2.4h\n" - "LDR q3, [%[in1]], #16\n" - "FCVTL2 v4.4s, v3.8h\n" - "FCVTL v3.4s, v3.4h\n" - "STP q2, q3, [%[out], #32]\n" - "LDR d5, [%[in1]], #8\n" - "FCVTL v5.4s, v5.4h\n" - ASM_PREFETCH("[%[in1], #192]") - "STP q4, q5, [%[out], #64]\n" - "LDR q6, [%[in2]], #16\n" - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "STP q6, q7, [%[out], #96]\n" - "LDR d8, [%[in2]], #8\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[in2], #192]") - "LDR q9, [%[in3]], #16\n" - "FCVTL2 v10.4s, v9.8h\n" - "FCVTL v9.4s, v9.4h\n" - "STP q8, q9, [%[out], #128]\n" - "LDR d11, [%[in3]], #8\n" - "FCVTL v11.4s, v11.4h\n" - "STP q10, q11, [%[out], #160]\n" - ASM_PREFETCH("[%[in3], #192]") - - : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform( - float* out, const __fp16* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp new file mode 100644 index 0000000000..dd1bd508ef --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 4 * height * sizeof(uint32_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x22], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [x23, #0x0]\n" + "str q18, [x23, #0x10]\n" + "str q17, [x23, #0x20]\n" + "str q16, [x23, #0x30]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x1\n" + "blt 5f\n" + "4:" // Main row loop: width 1 loop: loop + "ldr s19, [x24], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x22], #0x4\n" + "cmp x19, #0x1\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "str s19, [x23, #0x0]\n" + "str s18, [x23, #0x10]\n" + "str s17, [x23, #0x20]\n" + "str s16, [x23, #0x30]\n" + "add x23, x23, #0x4\n" + "bge 4b\n" + "5:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x40\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x4\n" + "blt 9f\n" + "8:" // Tail row loop: Column loop + "ldr q16, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "str q16, [x23, #0x0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Column loop skip + "cmp x19, #0x1\n" + "blt 11f\n" + "10:" // Tail row loop: width 1 loop: loop + "ldr s16, [x24], #0x4\n" + "sub x19, x19, #0x1\n" + "cmp x19, #0x1\n" + "str s16, [x23, #0x0]\n" + "add x23, x23, #0x4\n" + "bge 10b\n" + "11:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x10\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] 
"r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace + +template<> +void Transform<4, 1, true, VLType::None>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 4, + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp new file mode 100644 index 0000000000..7e7fcf5b8b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 16 * roundup(height, 4) * sizeof(uint8_t); + + __asm__ __volatile__( + "cmp %x[height], #0x10\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add x11, x12, %x[in_stride]\n" + "add x10, x11, %x[in_stride]\n" + "add x9, x10, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x10\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q18, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q20, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v18.16b, v18.16b, v17.16b\n" + "ldr q3, [x11], #0x10\n" + "ldr q2, [x10], #0x10\n" + "zip1 v17.16b, v20.16b, v16.16b\n" + "ldr q1, [x9], #0x10\n" + "zip2 v16.16b, v20.16b, v16.16b\n" + "ldr q0, [x28], #0x10\n" + "zip1 v31.16b, v19.16b, v17.16b\n" + "ldr q30, [x27], #0x10\n" + "zip2 v20.16b, v19.16b, v17.16b\n" + "ldr q29, [x26], #0x10\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr q28, [x25], #0x10\n" + "zip2 v18.16b, v18.16b, v16.16b\n" + "ldr q27, [x24], #0x10\n" + "zip1 v17.16b, v3.16b, v1.16b\n" + "ldr q26, [x23], #0x10\n" + "zip1 v16.16b, v2.16b, v0.16b\n" + "ldr q25, [x22], #0x10\n" + "zip1 v24.16b, v17.16b, v16.16b\n" + "ldr q23, [x21], #0x10\n" + "zip2 v22.16b, v17.16b, v16.16b\n" + "ldr q21, [x20], #0x10\n" + "zip2 v17.16b, v3.16b, v1.16b\n" + "str q31, [x15, #0x0]\n" + "zip2 v16.16b, v2.16b, v0.16b\n" + "str q20, [x15, #0x10]\n" + "zip1 v20.16b, v17.16b, v16.16b\n" + "str q19, [x15, #0x20]\n" + "zip2 v19.16b, v17.16b, v16.16b\n" + "str q18, [x15, #0x30]\n" + "zip1 v18.16b, v30.16b, v28.16b\n" + "str q24, [x15, #0x40]\n" + "zip1 v16.16b, v29.16b, v27.16b\n" + "str q22, [x15, #0x50]\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "str q20, [x15, #0x60]\n" + "zip2 v16.16b, v18.16b, v16.16b\n" + "str q19, [x15, #0x70]\n" + "zip2 v18.16b, v30.16b, v28.16b\n" + "str q17, [x15, #0x80]\n" + "zip2 v17.16b, v29.16b, v27.16b\n" + "str q16, [x15, #0x90]\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xa0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xb0]\n" + "zip1 v18.16b, v26.16b, v23.16b\n" + "zip1 v17.16b, v25.16b, v21.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xc0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xd0]\n" + "zip2 v18.16b, v26.16b, v23.16b\n" + "zip2 v17.16b, v25.16b, v21.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xe0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0xf0]\n" + "add x15, x15, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: width 4 loop: loop + "ldr s18, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr
s17, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr s16, [x12], #0x4\n" + "ldr s18, [x11], #0x4\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr s20, [x10], #0x4\n" + "ldr s17, [x9], #0x4\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr s16, [x28], #0x4\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x26], #0x4\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr s17, [x25], #0x4\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr s16, [x24], #0x4\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr s20, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr s17, [x21], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x20], #0x4\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str q23, [x15, #0x0]\n" + "str q21, [x15, #0x40]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str q18, [x15, #0x80]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0xc0]\n" + "add x15, x15, #0x10\n" + "bge 4b\n" + "5:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 7f\n" + "6:" // Main row loop: width 1 loop: loop + "ldr b18, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b17, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "ldr b18, [x11], #0x1\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr b20, [x10], #0x1\n" + "ldr b17, [x9], #0x1\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr b16, [x28], #0x1\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr b18, [x27], #0x1\n" + "ldr b22, [x26], #0x1\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr b17, [x25], #0x1\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr b16, [x24], #0x1\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr b20, [x23], #0x1\n" + "ldr b19, [x22], #0x1\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr b17, [x21], #0x1\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr b16, [x20], #0x1\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str s23, [x15, #0x0]\n" + "str s21, [x15, #0x40]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str s18, [x15, #0x80]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0xc0]\n" + "add x15, x15, #0x4\n" + "bge 6b\n" + "7:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x100\n" + "cmp %x[height], #0x10\n" + "bge 1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + + "9:" // Tail row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add %x[in], x12, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x12, x12, %x[pad_row], GT\n" + "csel x13, x13, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x14, x14, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 11f\n" + "10:" // Tail row loop: Column loop + "ldr q18, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q21, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v20.16b, v18.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v19.16b, v18.16b, v17.16b\n" + "zip1 v18.16b, v21.16b, v16.16b\n" + "zip2 v17.16b, v21.16b, v16.16b\n" + "zip1 v16.16b, v20.16b, v18.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v20.16b, v18.16b\n" + "str q16, [x15, #0x10]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Column loop skip + "cmp 
x19, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: width 4 loop: loop + "ldr s17, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s18, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr s16, [x12], #0x4\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "bge 12b\n" + "13:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 15f\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr b17, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b18, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0x0]\n" + "add x15, x15, #0x4\n" + "bge 14b\n" + "15:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x40\n" + "cmp %x[height], #0x1\n" + "bge 9b\n" + "16:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<16, 4, true, VLType::None>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_1x4( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<16, 4, true, VLType::None>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_1x4( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif
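For readers tracing the stores above: the 1x4 kernel walks the source in 16-column strips, packs each group of four rows (padding with the zeroed pad_row when height % 4 != 0) so the four bytes of one column land contiguously, and advances the destination by out_stride per strip. The following is a minimal scalar sketch of that mapping, derived from the store offsets above; the function name and the zero-fill of the trailing partial strip (which the kernel leaves unspecified) are illustrative, not part of this patch.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference: source element (row k, column x) lands at byte
//   (x / 16) * out_stride + (k / 4) * 64 + (x % 16) * 4 + (k % 4)
// with out_stride = 16 * roundup(height, 4).
std::vector<uint8_t> ref_transpose_interleave_16_1x4(
    const uint8_t *in, size_t width, size_t height, size_t in_stride)
{
    size_t padded     = (height + 3) / 4 * 4;   // roundup(height, 4)
    size_t out_stride = 16 * padded;            // bytes per 16-column strip
    std::vector<uint8_t> out(((width + 15) / 16) * out_stride, 0);

    for (size_t k = 0; k < padded; k++) {
        for (size_t x = 0; x < width; x++) {
            // Rows past the real height read as zero, mirroring pad_row.
            uint8_t v = (k < height) ? in[k * in_stride + x] : 0;
            out[(x / 16) * out_stride + (k / 4) * 64 + (x % 16) * 4 + (k % 4)] = v;
        }
    }
    return out;
}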
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp new file mode 100644 index 0000000000..f52fbbae4d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t))); + + if (height % 8) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 16 * roundup(height, 8) * sizeof(uint8_t); + + __asm__ __volatile__( + + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "cmp %x[height], #0x7\n" + "csel x20, x20, %x[pad_row], GT\n" + "csel x21, x21, %x[pad_row], GE\n" + "cmp %x[height], #0x5\n" + "csel x22, x22, %x[pad_row], GT\n" + "csel x23, x23, %x[pad_row], GE\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x20\n" + "ldr q4, [x25], #0x10\n" + "ldr q3, [x24], #0x10\n" + "ldr q16, [x23], #0x10\n" + "zip1 v2.16b, v17.16b, v16.16b\n" + "ldr q18, [x28], #0x10\n" + "zip2 v1.16b, v17.16b, v16.16b\n" + "ldr q0, [x26], #0x10\n" + "ldr q31, [x25], #0x10\n" + "ldr q30, [x24], #0x10\n" + "ldr q17, [x23], #0x10\n" + "zip1 v29.16b, v18.16b, v17.16b\n" + "ldr q16, [x22], #0x10\n" + "zip2 v28.16b, v18.16b, v17.16b\n" + "ldr q27, [x21], #0x10\n" + "ldr q26, [x20], #0x10\n" + "zip1 v25.16b, v19.16b, v16.16b\n" + "ldr q24, [x22], #0x10\n" + "zip2 v21.16b, v19.16b, v16.16b\n" + "ldr q23, [x21], #0x10\n" + "zip1 v20.16b, v4.16b, v27.16b\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.16b, v2.16b, v20.16b\n" + "zip1 v19.16b, v3.16b, v26.16b\n" + "zip1 v17.16b, v25.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v18.16b, v2.16b, v20.16b\n" + "zip2 v17.16b, v25.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v20.16b, v4.16b, v27.16b\n" + "zip1 v18.16b, v1.16b, v20.16b\n" + "zip2 v19.16b, v3.16b, v26.16b\n" + "zip1 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "zip2 v18.16b, v1.16b, v20.16b\n" + "zip2 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "zip1 v21.16b, v31.16b, v23.16b\n" + "zip1 v20.16b, v0.16b, v24.16b\n" + "zip1 v18.16b, v29.16b, v21.16b\n" + "zip1 v19.16b, v30.16b, v22.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, 
[x27, #0x10]\n" + "zip2 v18.16b, v29.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v21.16b, v31.16b, v23.16b\n" + "zip1 v18.16b, v28.16b, v21.16b\n" + "zip2 v20.16b, v0.16b, v24.16b\n" + "zip2 v19.16b, v30.16b, v22.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "zip2 v18.16b, v28.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q19, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q28, [x25], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q17, [x23], #0x10\n" + "zip1 v26.16b, v19.16b, v17.16b\n" + "ldr q16, [x22], #0x10\n" + "zip2 v25.16b, v19.16b, v17.16b\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x20], #0x10\n" + "zip1 v22.16b, v18.16b, v16.16b\n" + "zip2 v21.16b, v18.16b, v16.16b\n" + "zip1 v20.16b, v28.16b, v24.16b\n" + "zip1 v18.16b, v26.16b, v20.16b\n" + "zip1 v19.16b, v27.16b, v23.16b\n" + "zip1 v17.16b, v22.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "zip2 v18.16b, v26.16b, v20.16b\n" + "zip2 v17.16b, v22.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x30]\n" + "zip2 v20.16b, v28.16b, v24.16b\n" + "zip1 v18.16b, v25.16b, v20.16b\n" + "zip2 v19.16b, v27.16b, v23.16b\n" + "zip1 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x50]\n" + "zip2 v18.16b, v25.16b, v20.16b\n" + "zip2 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr s17, [x28], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s21, [x26], #0x4\n" + "cmp x19, #0x4\n" + "ldr s18, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s16, [x23], #0x4\n" + "zip1 v19.16b, v17.16b, v16.16b\n" + "ldr s17, [x22], #0x4\n" + "ldr s16, [x21], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x20], #0x4\n" + "zip1 v17.16b, v21.16b, v17.16b\n" + "zip1 v18.16b, v19.16b, v18.16b\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr b18, [x28], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b21, [x26], #0x1\n" + "cmp x19, #0x1\n" + "ldr b17, [x25], #0x1\n" + "ldr b20, [x24], #0x1\n" + "ldr b16, [x23], #0x1\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr b18, [x22], #0x1\n" + "ldr b16, [x21], #0x1\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr b16, [x20], 
#0x1\n" + "zip1 v18.16b, v21.16b, v18.16b\n" + "zip1 v17.16b, v19.16b, v17.16b\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x80\n" + "cmp %x[height], #0x1\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<16, 8, true, VLType::None>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_1x8( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<16, 8, true, VLType::None>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_1x8( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif
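The 1x8 file above follows the same strip layout as the 1x4 one, just with eight-row groups, so the two kernels differ only in the block factor. As a sketch under the same assumptions as the 1x4 reference earlier (the helper is hypothetical, not part of the patch), the shared index arithmetic is:

#include <cstddef>

// For the 1xB family (B = 4 for ..._16_1x4, B = 8 for ..._16_1x8), source
// element (row k, column x) maps to this byte offset in the destination.
size_t interleave_16_1xB_offset(size_t k, size_t x, size_t height, size_t B)
{
    size_t padded     = (height + B - 1) / B * B;  // roundup(height, B)
    size_t out_stride = 16 * padded;               // bytes per 16-column strip
    return (x / 16) * out_stride + (k / B) * 16 * B + (x % 16) * B + (k % B);
}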
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp new file mode 100644 index 0000000000..cfac12a84a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t))); + + if (height % 2) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 16 * roundup(height, 2) * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q16, [x26], #0x10\n" + "zip1 v1.8h, v18.8h, v16.8h\n" + "ldr q17, [x28], #0x10\n" + "cmp x19, #0x10\n" + "zip2 v0.8h, v18.8h, v16.8h\n" + "ldr q16, [x26], #0x10\n" + "ldr q19, [x25], #0x10\n" + "zip1 v31.8h, v17.8h, v16.8h\n" + "ldr q18, [x25], #0x10\n" + "zip2 v30.8h, v17.8h, v16.8h\n" + "ldr q16, [x24], #0x10\n" + "ldr q20, [x23], #0x10\n" + "zip1 v29.8h, v19.8h, v16.8h\n" + "ldr q17, [x24], #0x10\n" + "zip2 v28.8h, v19.8h, v16.8h\n" + "ldr q19, [x23], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v27.8h, v18.8h, v17.8h\n" + "ldr q26, [x21], #0x10\n" + "zip2 v25.8h, v18.8h, v17.8h\n" + "ldr q18, [x22], #0x10\n" + "zip1 v24.8h, v20.8h, v16.8h\n" + "ldr q17, [x20], #0x10\n" + "zip2 v23.8h, v20.8h, v16.8h\n" + "ldr q22, [x21], #0x10\n" + "zip1 v21.8h, v19.8h, v18.8h\n" + "ldr q16, [x20], #0x10\n" + "zip2 v20.8h, v19.8h, v18.8h\n" + "zip1 v19.8h, v26.8h, v17.8h\n" + "str q1, [x27, #0x0]\n" + "zip2 v18.8h, v26.8h, v17.8h\n" + "str q0, [x27, #0x10]\n" + "str q31, [x27, #0x20]\n" + "zip1 v17.8h, v22.8h, v16.8h\n" + "str q30, [x27, #0x30]\n" + "zip2 v16.8h, v22.8h, v16.8h\n" + "str q29, [x27, #0x40]\n" + "str q28, [x27, #0x50]\n" + "str q27, [x27, #0x60]\n" + "str q25, [x27, #0x70]\n" + "str q24, [x27, #0x80]\n" + "str q23, [x27, #0x90]\n" + "str q21, [x27, #0xa0]\n" + "str q20, [x27, #0xb0]\n" + "str q19, [x27, #0xc0]\n" + "str q18, [x27, #0xd0]\n" + "str q17, [x27, #0xe0]\n" + "str q16, [x27, #0xf0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: width 4 loop: loop + "ldr d17, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d16, [x26], #0x8\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "ldr d17, [x25], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x24], #0x8\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "ldr d17, [x23], #0x8\n" + "ldr d16, [x22], #0x8\n" + "zip1 v18.8h, v17.8h, v16.8h\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q20, [x27, #0x0]\n" + "str q19, [x27, #0x40]\n" + "str q18, [x27, #0x80]\n" + "str q16, [x27, #0xc0]\n" + "add x27, x27, #0x10\n" + "bge 4b\n" + "5:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 7f\n" + "6:" // Main row loop: width 1 loop: loop + "ldr h17, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h16, [x26], #0x2\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "ldr h17, [x25], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x24], #0x2\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "ldr h17, [x23], #0x2\n" + "ldr h16, [x22], #0x2\n" + "zip1 
v18.8h, v17.8h, v16.8h\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str s20, [x27, #0x0]\n" + "str s19, [x27, #0x40]\n" + "str s18, [x27, #0x80]\n" + "str s16, [x27, #0xc0]\n" + "add x27, x27, #0x4\n" + "bge 6b\n" + "7:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x100\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + + "9:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add %x[in], x26, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 11f\n" + "10:" // Tail row loop: Column loop + "ldr q18, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q16, [x26], #0x10\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "ldr q19, [x28], #0x10\n" + "cmp x19, #0x10\n" + "zip2 v18.8h, v18.8h, v16.8h\n" + "ldr q16, [x26], #0x10\n" + "str q17, [x27, #0x0]\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q18, [x27, #0x10]\n" + "zip2 v16.8h, v19.8h, v16.8h\n" + "str q17, [x27, #0x20]\n" + "str q16, [x27, #0x30]\n" + "add x27, x27, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: width 4 loop: loop + "ldr d17, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d16, [x26], #0x8\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [x27, #0x0]\n" + "add x27, x27, #0x10\n" + "cmp x19, #0x4\n" + "bge 12b\n" + "13:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 15f\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr h17, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h16, [x26], #0x2\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str s16, [x27, #0x0]\n" + "add x27, x27, #0x4\n" + "cmp x19, #0x1\n" + "bge 14b\n" + "15:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x40\n" + "cmp %x[height], #0x1\n" + "bge 9b\n" + "16:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<16, 2, true, VLType::None>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_2x2( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif
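The 2x2 kernel above moves bfloat16 as opaque 16-bit data: the wrapper passes width as (xmax-x0) * sizeof(bfloat16) / 2, i.e. the element count, and pairs rows instead of quadrupling them. A small illustrative sketch of the stride bookkeeping (hypothetical helper, not part of the patch):

#include <cstddef>
#include <cstdint>

// Each 16-column strip stores roundup(height, 2) rows of 16 uint16_t values,
// so e.g. height = 5 pads to 6 rows via pad_row: 16 * 6 * 2 = 192 bytes.
size_t out_stride_16_2x2(size_t height)
{
    size_t padded = (height + 1) / 2 * 2;   // roundup(height, 2)
    return 16 * padded * sizeof(uint16_t);
}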
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp new file mode 100644 index 0000000000..8c8dfd1d0d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 16 * roundup(height, 4) * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q24, [x28], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q4, [x26], #0x10\n" + "cmp x19, #0x20\n" + "ldr q26, [x25], #0x10\n" + "zip1 v2.8h, v24.8h, v26.8h\n" + "ldr q3, [x28], #0x10\n" + "zip2 v9.8h, v24.8h, v26.8h\n" + "ldr q0, [x26], #0x10\n" + "ldr q22, [x25], #0x10\n" + "zip1 v31.8h, v3.8h, v22.8h\n" + "ldr q23, [x28], #0x10\n" + "zip2 v25.8h, v3.8h, v22.8h\n" + "ldr q22, [x26], #0x10\n" + "ldr q5, [x25], #0x10\n" + "zip1 v17.8h, v23.8h, v5.8h\n" + "ldr q19, [x28], #0x10\n" + "zip2 v20.8h, v23.8h, v5.8h\n" + "ldr q1, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v15.8h, v19.8h, v16.8h\n" + "ldr q8, [x24], #0x10\n" + "zip2 v11.8h, v19.8h, v16.8h\n" + "ldr q26, [x23], #0x10\n" + "ldr q19, [x22], #0x10\n" + "zip1 v5.8h, v4.8h, v8.8h\n" + "ldr q18, [x24], #0x10\n" + "zip2 v6.8h, v4.8h, v8.8h\n" + "ldr q7, [x23], #0x10\n" + "zip1 v27.8h, v2.8h, v5.8h\n" + "ldr q23, [x22], #0x10\n" + "zip2 v8.8h, v2.8h, v5.8h\n" + "ldr q24, [x21], #0x10\n" + "zip1 v12.8h, v9.8h, v6.8h\n" + "ldr q13, [x24], #0x10\n" + "zip2 v16.8h, v9.8h, v6.8h\n" + "ldr q9, [x23], #0x10\n" + "zip1 v29.8h, v0.8h, v18.8h\n" + "ldr q10, [x22], #0x10\n" + "zip1 v14.8h, v31.8h, v29.8h\n" + "ldr q4, [x21], #0x10\n" + "zip2 v21.8h, v31.8h, v29.8h\n" + "ldr q6, [x24], #0x10\n" + "zip2 v18.8h, v0.8h, v18.8h\n" + "ldr q3, [x23], #0x10\n" + "zip1 v0.8h, v25.8h, v18.8h\n" + "ldr q31, [x22], #0x10\n" + "zip2 v29.8h, v25.8h, v18.8h\n" + "ldr q5, [x21], #0x10\n" + "zip1 v28.8h, 
v26.8h, v24.8h\n" + "ldr q25, [x20], #0x10\n" + "zip2 v26.8h, v26.8h, v24.8h\n" + "ldr q30, [x21], #0x10\n" + "zip1 v24.8h, v7.8h, v4.8h\n" + "zip2 v4.8h, v7.8h, v4.8h\n" + "ldr q18, [x20], #0x10\n" + "zip1 v7.8h, v19.8h, v25.8h\n" + "ldr q2, [x20], #0x10\n" + "zip2 v25.8h, v19.8h, v25.8h\n" + "zip1 v19.8h, v28.8h, v7.8h\n" + "zip2 v7.8h, v28.8h, v7.8h\n" + "ldr q28, [x20], #0x10\n" + "str q27, [x27, #0x0]\n" + "zip1 v27.8h, v26.8h, v25.8h\n" + "zip2 v26.8h, v26.8h, v25.8h\n" + "str q8, [x27, #0x10]\n" + "zip1 v25.8h, v23.8h, v18.8h\n" + "str q12, [x27, #0x20]\n" + "zip1 v8.8h, v24.8h, v25.8h\n" + "str q16, [x27, #0x30]\n" + "zip2 v25.8h, v24.8h, v25.8h\n" + "str q14, [x27, #0x40]\n" + "zip2 v12.8h, v23.8h, v18.8h\n" + "str q21, [x27, #0x50]\n" + "zip1 v21.8h, v4.8h, v12.8h\n" + "str q0, [x27, #0x60]\n" + "zip2 v14.8h, v4.8h, v12.8h\n" + "str q29, [x27, #0x70]\n" + "zip1 v12.8h, v22.8h, v13.8h\n" + "str q19, [x27, #0x80]\n" + "zip1 v24.8h, v17.8h, v12.8h\n" + "str q7, [x27, #0x90]\n" + "zip2 v23.8h, v17.8h, v12.8h\n" + "str q27, [x27, #0xa0]\n" + "zip2 v16.8h, v22.8h, v13.8h\n" + "str q26, [x27, #0xb0]\n" + "zip1 v19.8h, v20.8h, v16.8h\n" + "str q8, [x27, #0xc0]\n" + "zip2 v18.8h, v20.8h, v16.8h\n" + "str q25, [x27, #0xd0]\n" + "zip1 v16.8h, v1.8h, v6.8h\n" + "str q21, [x27, #0xe0]\n" + "zip1 v21.8h, v15.8h, v16.8h\n" + "str q14, [x27, #0xf0]\n" + "add x27, x27, %x[out_stride]\n" + "zip2 v17.8h, v15.8h, v16.8h\n" + "str q24, [x27, #0x0]\n" + "zip2 v16.8h, v1.8h, v6.8h\n" + "str q23, [x27, #0x10]\n" + "zip1 v20.8h, v11.8h, v16.8h\n" + "str q19, [x27, #0x20]\n" + "zip2 v19.8h, v11.8h, v16.8h\n" + "str q18, [x27, #0x30]\n" + "zip1 v18.8h, v9.8h, v5.8h\n" + "str q21, [x27, #0x40]\n" + "zip1 v16.8h, v10.8h, v2.8h\n" + "str q17, [x27, #0x50]\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q20, [x27, #0x60]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q19, [x27, #0x70]\n" + "zip2 v18.8h, v9.8h, v5.8h\n" + "str q17, [x27, #0x80]\n" + "zip2 v17.8h, v10.8h, v2.8h\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xa0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xb0]\n" + "zip1 v18.8h, v3.8h, v30.8h\n" + "zip1 v17.8h, v31.8h, v28.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xc0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xd0]\n" + "zip2 v18.8h, v3.8h, v30.8h\n" + "zip2 v17.8h, v31.8h, v28.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xe0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xf0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "ldr q18, [x28], #0x10\n" + "zip2 v22.8h, v17.8h, v16.8h\n" + "ldr q21, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v0.8h, v18.8h, v17.8h\n" + "ldr q16, [x24], #0x10\n" + "zip2 v31.8h, v18.8h, v17.8h\n" + "ldr q30, [x23], #0x10\n" + "ldr q29, [x22], #0x10\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "ldr q18, [x24], #0x10\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "ldr q28, [x23], #0x10\n" + "zip1 v27.8h, v19.8h, v17.8h\n" + "ldr q26, [x22], #0x10\n" + "zip2 v20.8h, v19.8h, v17.8h\n" + "ldr q25, [x21], #0x10\n" + "zip1 v19.8h, v22.8h, v16.8h\n" + "ldr q24, [x20], #0x10\n" + "zip2 v23.8h, v22.8h, v16.8h\n" + "ldr q22, [x21], #0x10\n" + "zip1 v17.8h, v21.8h, v18.8h\n" + "zip2 v18.8h, v21.8h, 
v18.8h\n" + "ldr q21, [x20], #0x10\n" + "zip1 v16.8h, v0.8h, v17.8h\n" + "str q27, [x27, #0x0]\n" + "zip2 v17.8h, v0.8h, v17.8h\n" + "str q20, [x27, #0x10]\n" + "zip1 v20.8h, v31.8h, v18.8h\n" + "str q19, [x27, #0x20]\n" + "zip2 v19.8h, v31.8h, v18.8h\n" + "str q23, [x27, #0x30]\n" + "zip1 v18.8h, v30.8h, v25.8h\n" + "str q16, [x27, #0x40]\n" + "zip1 v16.8h, v29.8h, v24.8h\n" + "str q17, [x27, #0x50]\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q20, [x27, #0x60]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q19, [x27, #0x70]\n" + "zip2 v18.8h, v30.8h, v25.8h\n" + "str q17, [x27, #0x80]\n" + "zip2 v17.8h, v29.8h, v24.8h\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xa0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xb0]\n" + "zip1 v18.8h, v28.8h, v22.8h\n" + "zip1 v17.8h, v26.8h, v21.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xc0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xd0]\n" + "zip2 v18.8h, v28.8h, v22.8h\n" + "zip2 v17.8h, v26.8h, v21.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xe0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0xf0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d17, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d18, [x26], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x25], #0x8\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "ldr d16, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "zip1 v18.8h, v17.8h, v16.8h\n" + "zip2 v17.8h, v17.8h, v16.8h\n" + "ldr d16, [x20], #0x8\n" + "str q18, [x27, #0x0]\n" + "zip1 v18.8h, v21.8h, v19.8h\n" + "str q17, [x27, #0x10]\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x80]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x90]\n" + "add x27, x27, #0x20\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h18, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h17, [x26], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x25], #0x2\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr h16, [x24], #0x2\n" + "ldr h20, [x23], #0x2\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "ldr h19, [x22], #0x2\n" + "ldr h17, [x21], #0x2\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr h16, [x20], #0x2\n" + "zip1 v17.8h, v20.8h, v17.8h\n" + "str d18, [x27, #0x0]\n" + "zip1 v16.8h, v19.8h, v16.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str d16, [x27, #0x80]\n" + "add x27, x27, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x100\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x20\n" + "ldr q16, [x25], #0x10\n" + "zip1 v20.8h, v17.8h, v16.8h\n" 
+ "ldr q18, [x28], #0x10\n" + "zip2 v0.8h, v17.8h, v16.8h\n" + "ldr q31, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v30.8h, v18.8h, v16.8h\n" + "ldr q17, [x28], #0x10\n" + "zip2 v29.8h, v18.8h, v16.8h\n" + "ldr q28, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v27.8h, v17.8h, v16.8h\n" + "ldr q18, [x28], #0x10\n" + "zip2 v26.8h, v17.8h, v16.8h\n" + "ldr q25, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v24.8h, v18.8h, v17.8h\n" + "ldr q16, [x24], #0x10\n" + "zip2 v23.8h, v18.8h, v17.8h\n" + "ldr q22, [x24], #0x10\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "zip2 v19.8h, v19.8h, v16.8h\n" + "ldr q21, [x24], #0x10\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "ldr q20, [x24], #0x10\n" + "zip1 v18.8h, v0.8h, v19.8h\n" + "zip2 v19.8h, v0.8h, v19.8h\n" + "str q16, [x27, #0x0]\n" + "zip1 v16.8h, v31.8h, v22.8h\n" + "str q17, [x27, #0x10]\n" + "zip1 v17.8h, v30.8h, v16.8h\n" + "str q18, [x27, #0x20]\n" + "zip2 v18.8h, v30.8h, v16.8h\n" + "str q19, [x27, #0x30]\n" + "zip2 v16.8h, v31.8h, v22.8h\n" + "str q17, [x27, #0x40]\n" + "zip1 v17.8h, v29.8h, v16.8h\n" + "str q18, [x27, #0x50]\n" + "zip2 v16.8h, v29.8h, v16.8h\n" + "str q17, [x27, #0x60]\n" + "zip1 v17.8h, v28.8h, v21.8h\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "zip1 v16.8h, v27.8h, v17.8h\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.8h, v27.8h, v17.8h\n" + "zip2 v17.8h, v28.8h, v21.8h\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.8h, v26.8h, v17.8h\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.8h, v26.8h, v17.8h\n" + "str q16, [x27, #0x30]\n" + "zip1 v17.8h, v25.8h, v20.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.8h, v24.8h, v17.8h\n" + "str q16, [x27, #0x50]\n" + "zip2 v17.8h, v25.8h, v20.8h\n" + "zip1 v16.8h, v23.8h, v17.8h\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.8h, v23.8h, v17.8h\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q25, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v24.8h, v17.8h, v16.8h\n" + "ldr q18, [x28], #0x10\n" + "zip2 v23.8h, v17.8h, v16.8h\n" + "ldr q22, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v21.8h, v18.8h, v17.8h\n" + "ldr q16, [x24], #0x10\n" + "zip2 v20.8h, v18.8h, v17.8h\n" + "ldr q19, [x24], #0x10\n" + "zip1 v18.8h, v25.8h, v16.8h\n" + "zip2 v17.8h, v25.8h, v16.8h\n" + "zip1 v16.8h, v24.8h, v18.8h\n" + "str q16, [x27, #0x0]\n" + "zip2 v16.8h, v24.8h, v18.8h\n" + "str q16, [x27, #0x10]\n" + "zip1 v16.8h, v23.8h, v17.8h\n" + "str q16, [x27, #0x20]\n" + "zip2 v16.8h, v23.8h, v17.8h\n" + "str q16, [x27, #0x30]\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" + "str q16, [x27, #0x40]\n" + "zip2 v16.8h, v21.8h, v17.8h\n" + "str q16, [x27, #0x50]\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [x27, #0x60]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d18, [x28], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d17, [x26], #0x8\n" + "cmp x19, #0x4\n" + "ldr d16, [x25], #0x8\n" + "zip1 v18.8h, v18.8h, v16.8h\n" + "ldr d16, [x24], #0x8\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, 
#0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h17, [x28], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h18, [x26], #0x2\n" + "cmp x19, #0x1\n" + "ldr h16, [x25], #0x2\n" + "zip1 v17.8h, v17.8h, v16.8h\n" + "ldr h16, [x24], #0x2\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x80\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<16, 4, true, VLType::None>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_2x4( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif
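The next file is the fast-mode feeder: fp32 source data is interleaved and narrowed to bfloat16 on the fly using bfcvtn/bfcvtn2, emitted as raw .inst words with the mnemonic kept in a comment. A scalar sketch of that narrowing, assuming the round-to-nearest-even behaviour of Armv8.6 BFCVT; the helper name and the simplified NaN handling are illustrative only:

#include <cstdint>
#include <cstring>

// fp32 -> bf16 round-to-nearest-even: add 0x7FFF plus the lowest kept bit,
// then keep the top 16 bits. A real BFCVT also quietens signalling NaNs,
// which this sketch does not model.
uint16_t fp32_to_bf16_rne(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits += 0x7FFFu + ((bits >> 16) & 1u);
    return static_cast<uint16_t>(bits >> 16);
}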
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..2ecf03c4c1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 16 * roundup(height, 4) * sizeof(bfloat16); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v23.4s, v17.4s, v16.4s\n" + "ldr q22, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v21.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v14.4s, v18.4s, v16.4s\n" + "ldr q13, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v12.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v11.4s, v17.4s, v16.4s\n" + "ldr q10, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v9.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v8.4s, v18.4s, v17.4s\n" + "ldr q7, [x23], #0x10\n" + "ldr q6, [x22], #0x10\n" + "zip1 v17.4s, v19.4s, v16.4s\n" + "ldr q20, [x24], #0x10\n" + "zip2 v19.4s, v19.4s, v16.4s\n" + "ldr q5, [x23], #0x10\n" + "zip1 v16.4s, v24.4s, v17.4s\n" + "ldr q4, [x22], #0x10\n" + ".inst 0x0ea16a03 // bfcvtn v3.4h, v16.4s\n" + "ldr q2, [x21], #0x10\n" + "zip2 v17.4s, v24.4s, v17.4s\n" + "ldr q1, [x24], #0x10\n" + "zip1 v16.4s, v23.4s, v19.4s\n" + "ldr q0, [x23], #0x10\n" + ".inst 0x4ea16a23 // bfcvtn2 v3.8h, v17.4s\n" + "ldr q31, [x22], #0x10\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "ldr q30, [x21], #0x10\n" + "zip2 v16.4s, v23.4s, v19.4s\n" + "ldr q29, [x24], #0x10\n" + "zip1 v17.4s, v22.4s, v20.4s\n" + "ldr q28, [x23], #0x10\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "ldr q27, [x22], #0x10\n" + "zip1 v16.4s, v21.4s, v17.4s\n" + "ldr q26, [x21], #0x10\n" + ".inst 0x0ea16a19 // bfcvtn v25.4h, v16.4s\n" + "ldr q24, [x20], #0x10\n" + "zip2 v16.4s, v21.4s, v17.4s\n" + "ldr q23, [x21], #0x10\n" + ".inst 0x4ea16a19 // bfcvtn2 v25.8h, v16.4s\n" + "zip2 v17.4s, v22.4s, v20.4s\n" + "ldr q22, [x20], #0x10\n" + "zip1 v16.4s, v14.4s, v17.4s\n" + "ldr q21, [x20], #0x10\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v14.4s, v17.4s\n" + "ldr q20, [x20], #0x10\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "zip1 v17.4s, v13.4s, v1.4s\n" + "str q3, [x27, #0x0]\n" + "zip1 v16.4s, v12.4s, v17.4s\n" + "str q18, [x27, #0x10]\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "str q25, [x27, #0x20]\n" + "zip2 v16.4s, v12.4s, v17.4s\n" + "str q19, [x27, #0x30]\n" + "zip2 v17.4s, v13.4s, v1.4s\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q18, [x27, #0x40]\n" + "zip1 v16.4s, v11.4s, v17.4s\n" + "zip2 v19.4s, v11.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip1 v17.4s, v10.4s, v29.4s\n" + "zip1 v16.4s, v9.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str 
q18, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v9.4s, v17.4s\n" + "zip2 v17.4s, v10.4s, v29.4s\n" + "zip1 v16.4s, v8.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x60]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v8.4s, v17.4s\n" + "zip1 v18.4s, v7.4s, v2.4s\n" + "zip1 v17.4s, v6.4s, v24.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x70]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v7.4s, v2.4s\n" + "zip2 v17.4s, v6.4s, v24.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x80]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v5.4s, v30.4s\n" + "zip1 v17.4s, v4.4s, v22.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x90]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v5.4s, v30.4s\n" + "zip2 v17.4s, v4.4s, v22.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xa0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v0.4s, v26.4s\n" + "zip1 v17.4s, v31.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xb0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v0.4s, v26.4s\n" + "zip2 v17.4s, v31.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xc0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v28.4s, v23.4s\n" + "zip1 v17.4s, v27.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xd0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v28.4s, v23.4s\n" + "zip2 v17.4s, v27.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xe0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0xf0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: width 4 loop: loop + "ldr q20, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v19.4s, v20.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v25.4s, v20.4s, v17.4s\n" + "ldr q24, [x23], #0x10\n" + "ldr q23, [x22], #0x10\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "ldr q22, [x21], #0x10\n" + "zip2 v21.4s, v18.4s, v16.4s\n" + "ldr q20, [x20], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v25.4s, v21.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v21.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x10]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, 
v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x80]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0x90]\n" + "add x27, x27, #0x20\n" + "bge 4b\n" + "5:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 7f\n" + "6:" // Main row loop: width 1 loop: loop + "ldr s18, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s17, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v18.4s, v18.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + "ldr s19, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "ldr s18, [x20], #0x4\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v17.4s, v20.4s, v17.4s\n" + "str d16, [x27, #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x80]\n" + "add x27, x27, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x100\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + + "9:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 11f\n" + "10:" // Tail row loop: Column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v1.4s, v17.4s, v16.4s\n" + "ldr q0, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v31.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v30.4s, v18.4s, v16.4s\n" + "ldr q29, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v28.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v27.4s, v17.4s, v16.4s\n" + "ldr q26, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v25.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v24.4s, v18.4s, v17.4s\n" + "ldr q23, [x24], #0x10\n" + "zip1 v17.4s, v20.4s, v16.4s\n" + "zip2 v22.4s, v20.4s, v16.4s\n" + "ldr q21, [x24], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + "zip2 v19.4s, v19.4s, v17.4s\n" + "ldr q20, [x24], #0x10\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip1 v16.4s, v1.4s, v22.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v16.4s, v1.4s, v22.4s\n" + ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n" + "str q17, [x27, #0x0]\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "zip1 v17.4s, v0.4s, v23.4s\n" + "str q18, [x27, #0x10]\n" + "zip1 v16.4s, v31.4s, v17.4s\n" + "zip2 v19.4s, v31.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v0.4s, v23.4s\n" + "zip1 v16.4s, v30.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v30.4s, v17.4s\n" + "zip1 v17.4s, v29.4s, v21.4s\n" + "zip1 v16.4s, v28.4s, 
v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v28.4s, v17.4s\n" + "zip2 v17.4s, v29.4s, v21.4s\n" + "zip1 v16.4s, v27.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v27.4s, v17.4s\n" + "zip1 v17.4s, v26.4s, v20.4s\n" + "zip1 v16.4s, v25.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v17.4s, v25.4s, v17.4s\n" + "zip2 v18.4s, v26.4s, v20.4s\n" + "zip1 v16.4s, v24.4s, v18.4s\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + "str q19, [x27, #0x60]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v24.4s, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x70]\n" + "add x27, x27, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: width 4 loop: loop + "ldr q19, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v21.4s, v19.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v20.4s, v19.4s, v17.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v19.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v21.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 12b\n" + "13:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 15f\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr s17, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v17.4s, v17.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 14b\n" + "15:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x80\n" + "cmp %x[height], #0x1\n" + "bge 9b\n" + "16:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace +template<> +void Transform<16, 4, true, VLType::None>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_16_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp new file mode 100644 index 0000000000..9f3ab95108 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp @@ -0,0 
+1,272 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 12 * height * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q29, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q18, [x22], #0x10\n" + "dup v28.2d, v18.d[1]\n" + "ldr q16, [x24], #0x10\n" + "cmp x19, #0x18\n" + "dup v27.2d, v16.d[0]\n" + "ldr q17, [x24], #0x10\n" + "dup v26.2d, v16.d[1]\n" + "ldr q16, [x22], #0x10\n" + "mov v27.d[1], v18.d[0]\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "mov v26.d[1], v17.d[0]\n" + "ldr q23, [x22], #0x10\n" + "mov v28.d[1], v16.d[0]\n" + "dup v22.2d, v17.d[1]\n" + "ldr q17, [x21], #0x10\n" + "dup v21.2d, v24.d[1]\n" + "ldr q20, [x20], #0x10\n" + "mov v22.d[1], v16.d[1]\n" + "ldr q16, [x21], #0x10\n" + "dup v19.2d, v17.d[0]\n" + "dup v18.2d, v17.d[1]\n" + "ldr q17, [x20], #0x10\n" + "mov v19.d[1], v24.d[0]\n" + "str q29, [x23, #0x0]\n" + "mov v21.d[1], v20.d[0]\n" + "str q27, [x23, #0x10]\n" + "str q28, [x23, #0x20]\n" + "mov v18.d[1], v16.d[0]\n" + "dup v16.2d, v16.d[1]\n" + "str q25, [x23, #0x30]\n" + "mov v16.d[1], v20.d[1]\n" + "str q19, [x23, #0x40]\n" + "str q21, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "str q26, [x23, #0x0]\n" + "str q22, [x23, #0x10]\n" + "str q23, [x23, #0x20]\n" + "str q18, [x23, #0x30]\n" + "str q16, [x23, #0x40]\n" + "str q17, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q22, [x24], #0x10\n" + "sub x19, x19, #0xc\n" + "ldr q16, [x22], #0x10\n" + "dup v21.2d, v16.d[1]\n" + "ldr d20, [x24], #0x8\n" + "cmp x19, #0xc\n" + "mov v20.d[1], v16.d[0]\n" + "ldr d16, [x22], #0x8\n" + "ldr q19, [x21], #0x10\n" + "mov v21.d[1], v16.d[0]\n" + "ldr d18, [x21], #0x8\n" + "ldr q16, [x20], #0x10\n" + "mov v18.d[1], v16.d[0]\n" 
+ "ldr d17, [x20], #0x8\n" + "dup v16.2d, v16.d[1]\n" + "str q22, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "mov v16.d[1], v17.d[0]\n" + "str q21, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q16, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d19, [x24], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d18, [x22], #0x8\n" + "cmp x19, #0x4\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "str d19, [x23, #0x0]\n" + "str d18, [x23, #0x18]\n" + "str d17, [x23, #0x30]\n" + "str d16, [x23, #0x48]\n" + "add x23, x23, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h19, [x24], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h18, [x22], #0x2\n" + "cmp x19, #0x1\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "str h19, [x23, #0x0]\n" + "str h18, [x23, #0x18]\n" + "str h17, [x23, #0x30]\n" + "str h16, [x23, #0x48]\n" + "add x23, x23, #0x2\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x60\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "cmp x19, #0x18\n" + "ldr q16, [x24], #0x10\n" + "dup v18.2d, v16.d[0]\n" + "ldr q17, [x24], #0x10\n" + "dup v16.2d, v16.d[1]\n" + "str q19, [x23, #0x0]\n" + "str d18, [x23, #0x10]\n" + "mov v16.d[1], v17.d[0]\n" + "add x23, x23, %x[out_stride]\n" + "str q16, [x23, #0x0]\n" + "dup v16.2d, v17.d[1]\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q17, [x24], #0x10\n" + "sub x19, x19, #0xc\n" + "cmp x19, #0xc\n" + "ldr d16, [x24], #0x8\n" + "str q17, [x23, #0x0]\n" + "str d16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "str d16, [x23, #0x0]\n" + "add x23, x23, #0x8\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "sub x19, x19, #0x1\n" + "cmp x19, #0x1\n" + "str h16, [x23, #0x0]\n" + "add x23, x23, #0x2\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x18\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace + +template<> +void Transform<6, 1, true, VLType::None>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24( + reinterpret_cast(out), + reinterpret_cast(in + 
k0 * stride + x0), + (xmax-x0) * sizeof(float) / 2, + stride * sizeof(float), + (kmax-k0) + ); +} + +template<> +void Transform<12, 1, true, VLType::None>( + int16_t *out, const int16_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(int16_t) / 2, + stride * sizeof(int16_t), + (kmax-k0) + ); +} + +template<> +void Transform<12, 1, true, VLType::None>( + uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint16_t) / 2, + stride * sizeof(uint16_t), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..101be7e843 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+    float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+    if (height % 4) {
+        memset(pad_row, 0, width * sizeof(float));
+    }
+
+    size_t out_stride = 24 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+    __asm__ __volatile__(
+      "cmp %x[height], #0x8\n"
+      "blt 10f\n"
+      "1:" // Main row loop: Head
+      "mov x28, %x[in]\n"
+      "mov x27, %x[out]\n"
+      "add x26, x28, %x[in_stride]\n"
+      "add x25, x26, %x[in_stride]\n"
+      "add x24, x25, %x[in_stride]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "add x22, x23, %x[in_stride]\n"
+      "add x21, x22, %x[in_stride]\n"
+      "add x20, x21, %x[in_stride]\n"
+      "add %x[in], x20, %x[in_stride]\n"
+      "sub %x[height], %x[height], #0x8\n"
+      "mov x19, %x[width]\n"
+      "cmp x19, #0x18\n"
+      "blt 3f\n"
+      "2:" // Main row loop: Column loop
+      "ldr q3, [x28], #0x10\n"
+      "sub x19, x19, #0x18\n"
+      "ldr q27, [x26], #0x10\n"
+      "cmp x19, #0x18\n"
+      "ldr q26, [x25], #0x10\n"
+      "zip1 v28.4s, v3.4s, v26.4s\n"
+      "ldr q1, [x28], #0x10\n"
+      "zip2 v12.4s, v3.4s, v26.4s\n"
+      "ldr q26, [x26], #0x10\n"
+      "ldr q17, [x25], #0x10\n"
+      "zip1 v11.4s, v1.4s, v17.4s\n"
+      "ldr q31, [x28], #0x10\n"
+      "zip2 v23.4s, v1.4s, v17.4s\n"
+      "ldr q3, [x26], #0x10\n"
+      "ldr q1, [x25], #0x10\n"
+      "zip1 v10.4s, v31.4s, v1.4s\n"
+      "ldr q14, [x28], #0x10\n"
+      "zip2 v17.4s, v31.4s, v1.4s\n"
+      "ldr q6, [x26], #0x10\n"
+      "ldr q1, [x25], #0x10\n"
+      "zip1 v5.4s, v14.4s, v1.4s\n"
+      "ldr q0, [x28], #0x10\n"
+      "zip2 v8.4s, v14.4s, v1.4s\n"
+      "ldr q2, [x26], #0x10\n"
+      "ldr q30, [x25], #0x10\n"
+      "zip1 v15.4s, v0.4s, v30.4s\n"
+      "ldr q14, [x28], #0x10\n"
+      "zip2 v1.4s, v0.4s, v30.4s\n"
+      "ldr q0, [x26], #0x10\n"
+      "ldr q29, [x25], #0x10\n"
+      "zip1 v19.4s, v14.4s, v29.4s\n"
+      "ldr q25, [x24], #0x10\n"
+      "zip2 v30.4s, v14.4s, v29.4s\n"
+      "ldr q7, [x23], #0x10\n"
+      "ldr q31, [x22], #0x10\n"
+      "zip1 v20.4s, v27.4s, v25.4s\n"
+      "ldr q24, [x24], #0x10\n"
+      "zip2 v4.4s, v27.4s, v25.4s\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip1 v14.4s, v28.4s, v20.4s\n"
+      "ldr q13, [x22], #0x10\n"
+      ".inst 0x0ea169d0 // bfcvtn v16.4h, v14.4s\n"
+      "ldr q29, [x21], #0x10\n"
+      "zip2 v21.4s, v28.4s, v20.4s\n"
+      "ldr q27, [x24], #0x10\n"
+      "zip1 v9.4s, v12.4s, v4.4s\n"
+      "ldr q14, [x23], #0x10\n"
+      ".inst 0x4ea16ab0 // bfcvtn2 v16.8h, v21.4s\n"
+      "ldr q21, [x22], #0x10\n"
+      ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+      "ldr q18, [x21], #0x10\n"
+      "zip2 v25.4s, v12.4s, v4.4s\n"
+      "ldr q4, [x24], #0x10\n"
+      "zip1 v28.4s, v26.4s, v24.4s\n"
+      "ldr q20, [x23], #0x10\n"
+      ".inst 0x4ea16b29 // bfcvtn2 v9.8h, v25.4s\n"
+      "ldr q12, [x22], #0x10\n"
+      "zip1 v25.4s, v11.4s, v28.4s\n"
+      ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+      "zip2 v11.4s, v11.4s, v28.4s\n"
+      "ldr q28, [x24], #0x10\n"
+      "zip2 v26.4s, v26.4s, v24.4s\n"
+      "ldr q24, [x23], #0x10\n"
+      ".inst 0x4ea16979 // bfcvtn2 v25.8h, v11.4s\n"
+      "zip1 v11.4s, v23.4s, v26.4s\n"
+      ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+      "zip2 v23.4s, v23.4s, v26.4s\n"
+      "ldr q26, [x24], #0x10\n"
+      ".inst 0x4ea16aeb // bfcvtn2 v11.8h, v23.4s\n"
+      "zip1 v23.4s, v3.4s, v27.4s\n"
+      "zip2 v27.4s, v3.4s, v27.4s\n"
+      "zip1 v3.4s, v10.4s, v23.4s\n"
+      ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+      "zip2 v10.4s, v10.4s, v23.4s\n"
+      "ldr q23, [x23], #0x10\n"
+      ".inst 0x4ea16943 // bfcvtn2 v3.8h, v10.4s\n"
+      "zip1 v10.4s, v17.4s, v27.4s\n"
+      ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
+      "zip2 v27.4s, v17.4s, v27.4s\n"
+      "ldr q17, [x22], #0x10\n"
+
".inst 0x4ea16b6a // bfcvtn2 v10.8h, v27.4s\n" + "zip1 v27.4s, v6.4s, v4.4s\n" + "zip2 v6.4s, v6.4s, v4.4s\n" + "zip1 v4.4s, v5.4s, v27.4s\n" + ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" + "zip2 v27.4s, v5.4s, v27.4s\n" + "ldr q5, [x22], #0x10\n" + ".inst 0x4ea16b64 // bfcvtn2 v4.8h, v27.4s\n" + "zip1 v27.4s, v8.4s, v6.4s\n" + ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n" + "zip2 v6.4s, v8.4s, v6.4s\n" + "ldr q8, [x21], #0x10\n" + ".inst 0x4ea168db // bfcvtn2 v27.8h, v6.4s\n" + "zip1 v6.4s, v2.4s, v28.4s\n" + "zip2 v2.4s, v2.4s, v28.4s\n" + "zip1 v28.4s, v15.4s, v6.4s\n" + ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n" + "zip2 v6.4s, v15.4s, v6.4s\n" + "ldr q15, [x21], #0x10\n" + ".inst 0x4ea168dc // bfcvtn2 v28.8h, v6.4s\n" + "zip1 v6.4s, v1.4s, v2.4s\n" + ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n" + "zip2 v2.4s, v1.4s, v2.4s\n" + "ldr q1, [x21], #0x10\n" + ".inst 0x4ea16846 // bfcvtn2 v6.8h, v2.4s\n" + "zip1 v2.4s, v0.4s, v26.4s\n" + "zip2 v26.4s, v0.4s, v26.4s\n" + "zip1 v0.4s, v19.4s, v2.4s\n" + ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" + "zip2 v19.4s, v19.4s, v2.4s\n" + "ldr q2, [x21], #0x10\n" + ".inst 0x4ea16a60 // bfcvtn2 v0.8h, v19.4s\n" + "zip1 v19.4s, v30.4s, v26.4s\n" + ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" + "zip2 v30.4s, v30.4s, v26.4s\n" + "ldr q26, [x20], #0x10\n" + ".inst 0x4ea16bd3 // bfcvtn2 v19.8h, v30.4s\n" + "zip1 v30.4s, v7.4s, v29.4s\n" + "zip2 v29.4s, v7.4s, v29.4s\n" + "zip1 v7.4s, v22.4s, v18.4s\n" + "zip2 v18.4s, v22.4s, v18.4s\n" + "zip1 v22.4s, v31.4s, v26.4s\n" + "zip2 v26.4s, v31.4s, v26.4s\n" + "zip1 v31.4s, v30.4s, v22.4s\n" + ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n" + "zip2 v30.4s, v30.4s, v22.4s\n" + "ldr q22, [x20], #0x10\n" + ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n" + "zip1 v30.4s, v29.4s, v26.4s\n" + ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n" + "zip2 v26.4s, v29.4s, v26.4s\n" + "ldr q29, [x20], #0x10\n" + ".inst 0x4ea16b5e // bfcvtn2 v30.8h, v26.4s\n" + "zip1 v26.4s, v13.4s, v22.4s\n" + "zip2 v13.4s, v13.4s, v22.4s\n" + "zip1 v22.4s, v7.4s, v26.4s\n" + ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" + "zip2 v7.4s, v7.4s, v26.4s\n" + "ldr q26, [x20], #0x10\n" + ".inst 0x4ea168f6 // bfcvtn2 v22.8h, v7.4s\n" + "zip1 v7.4s, v18.4s, v13.4s\n" + ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n" + "zip2 v13.4s, v18.4s, v13.4s\n" + "ldr q18, [x20], #0x10\n" + ".inst 0x4ea169a7 // bfcvtn2 v7.8h, v13.4s\n" + "ldr q13, [x20], #0x10\n" + "str q16, [x27, #0x0]\n" + "zip1 v16.4s, v14.4s, v8.4s\n" + "zip2 v8.4s, v14.4s, v8.4s\n" + "str q9, [x27, #0x10]\n" + "zip1 v9.4s, v21.4s, v29.4s\n" + "str q25, [x27, #0x20]\n" + "zip1 v25.4s, v16.4s, v9.4s\n" + "str q11, [x27, #0x30]\n" + ".inst 0x0ea16b2e // bfcvtn v14.4h, v25.4s\n" + "str q3, [x27, #0x40]\n" + "zip2 v25.4s, v16.4s, v9.4s\n" + "str q10, [x27, #0x50]\n" + "zip2 v29.4s, v21.4s, v29.4s\n" + "str q4, [x27, #0x60]\n" + ".inst 0x4ea16b2e // bfcvtn2 v14.8h, v25.4s\n" + "str q27, [x27, #0x70]\n" + "zip1 v27.4s, v8.4s, v29.4s\n" + "str q28, [x27, #0x80]\n" + ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n" + "str q6, [x27, #0x90]\n" + "zip2 v16.4s, v8.4s, v29.4s\n" + "str q0, [x27, #0xa0]\n" + "zip1 v0.4s, v20.4s, v15.4s\n" + "str q19, [x27, #0xb0]\n" + ".inst 0x4ea16a1b // bfcvtn2 v27.8h, v16.4s\n" + "str q31, [x27, #0xc0]\n" + "zip1 v29.4s, v12.4s, v26.4s\n" + "str q30, [x27, #0xd0]\n" + "zip1 v28.4s, v0.4s, v29.4s\n" + "str q22, [x27, #0xe0]\n" + ".inst 0x0ea16b83 // bfcvtn v3.4h, v28.4s\n" + "str q7, [x27, #0xf0]\n" + "zip2 v22.4s, v0.4s, v29.4s\n" + "str q14, [x27, #0x100]\n" + "zip2 v19.4s, v20.4s, 
v15.4s\n" + "str q27, [x27, #0x110]\n" + ".inst 0x4ea16ac3 // bfcvtn2 v3.8h, v22.4s\n" + "str q3, [x27, #0x120]\n" + "zip2 v4.4s, v12.4s, v26.4s\n" + "zip1 v20.4s, v24.4s, v1.4s\n" + "zip1 v22.4s, v19.4s, v4.4s\n" + ".inst 0x0ea16ad9 // bfcvtn v25.4h, v22.4s\n" + "zip2 v6.4s, v19.4s, v4.4s\n" + "zip1 v22.4s, v17.4s, v18.4s\n" + ".inst 0x4ea168d9 // bfcvtn2 v25.8h, v6.4s\n" + "str q25, [x27, #0x130]\n" + "zip1 v3.4s, v20.4s, v22.4s\n" + "zip2 v22.4s, v20.4s, v22.4s\n" + ".inst 0x0ea16864 // bfcvtn v4.4h, v3.4s\n" + "zip2 v15.4s, v24.4s, v1.4s\n" + "zip2 v17.4s, v17.4s, v18.4s\n" + ".inst 0x4ea16ac4 // bfcvtn2 v4.8h, v22.4s\n" + "str q4, [x27, #0x140]\n" + "zip1 v16.4s, v15.4s, v17.4s\n" + "zip2 v8.4s, v15.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v23.4s, v2.4s\n" + "zip1 v17.4s, v5.4s, v13.4s\n" + ".inst 0x4ea16910 // bfcvtn2 v16.8h, v8.4s\n" + "str q16, [x27, #0x150]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v10.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v23.4s, v2.4s\n" + "zip2 v17.4s, v5.4s, v13.4s\n" + ".inst 0x4ea16950 // bfcvtn2 v16.8h, v10.4s\n" + "str q16, [x27, #0x160]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0x170]\n" + "add x27, x27, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q19, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v23.4s, v17.4s, v16.4s\n" + "ldr q22, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v21.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v14.4s, v18.4s, v16.4s\n" + "ldr q13, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v12.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v11.4s, v17.4s, v16.4s\n" + "ldr q10, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v9.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v8.4s, v18.4s, v17.4s\n" + "ldr q7, [x23], #0x10\n" + "ldr q6, [x22], #0x10\n" + "zip1 v17.4s, v19.4s, v16.4s\n" + "ldr q20, [x24], #0x10\n" + "zip2 v19.4s, v19.4s, v16.4s\n" + "ldr q5, [x23], #0x10\n" + "zip1 v16.4s, v24.4s, v17.4s\n" + "ldr q4, [x22], #0x10\n" + ".inst 0x0ea16a03 // bfcvtn v3.4h, v16.4s\n" + "ldr q2, [x21], #0x10\n" + "zip2 v17.4s, v24.4s, v17.4s\n" + "ldr q1, [x24], #0x10\n" + "zip1 v16.4s, v23.4s, v19.4s\n" + "ldr q0, [x23], #0x10\n" + ".inst 0x4ea16a23 // bfcvtn2 v3.8h, v17.4s\n" + "ldr q31, [x22], #0x10\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "ldr q30, [x21], #0x10\n" + "zip2 v16.4s, v23.4s, v19.4s\n" + "ldr q29, [x24], #0x10\n" + "zip1 v17.4s, v22.4s, v20.4s\n" + "ldr q28, [x23], #0x10\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "ldr q27, [x22], #0x10\n" + "zip1 v16.4s, v21.4s, v17.4s\n" + "ldr q26, [x21], #0x10\n" + ".inst 0x0ea16a19 // bfcvtn v25.4h, v16.4s\n" + "ldr q24, [x20], #0x10\n" + "zip2 v16.4s, v21.4s, v17.4s\n" + "ldr q23, [x21], #0x10\n" + ".inst 0x4ea16a19 // bfcvtn2 v25.8h, v16.4s\n" + "zip2 v17.4s, v22.4s, v20.4s\n" + "ldr q22, [x20], #0x10\n" + "zip1 v16.4s, v14.4s, v17.4s\n" + "ldr q21, [x20], #0x10\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v14.4s, v17.4s\n" + "ldr q20, [x20], #0x10\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "zip1 
v17.4s, v13.4s, v1.4s\n" + "str q3, [x27, #0x0]\n" + "zip1 v16.4s, v12.4s, v17.4s\n" + "str q18, [x27, #0x10]\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "str q25, [x27, #0x20]\n" + "zip2 v16.4s, v12.4s, v17.4s\n" + "str q19, [x27, #0x30]\n" + "zip2 v17.4s, v13.4s, v1.4s\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q18, [x27, #0x40]\n" + "zip1 v16.4s, v11.4s, v17.4s\n" + "zip2 v19.4s, v11.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip1 v17.4s, v10.4s, v29.4s\n" + "zip1 v16.4s, v9.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v9.4s, v17.4s\n" + "zip2 v17.4s, v10.4s, v29.4s\n" + "zip1 v16.4s, v8.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x60]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v8.4s, v17.4s\n" + "zip1 v18.4s, v7.4s, v2.4s\n" + "zip1 v17.4s, v6.4s, v24.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x70]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v7.4s, v2.4s\n" + "zip2 v17.4s, v6.4s, v24.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xc0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v5.4s, v30.4s\n" + "zip1 v17.4s, v4.4s, v22.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xd0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v5.4s, v30.4s\n" + "zip2 v17.4s, v4.4s, v22.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xe0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v0.4s, v26.4s\n" + "zip1 v17.4s, v31.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xf0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v0.4s, v26.4s\n" + "zip2 v17.4s, v31.4s, v21.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x100]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v18.4s, v28.4s, v23.4s\n" + "zip1 v17.4s, v27.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x110]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v28.4s, v23.4s\n" + "zip2 v17.4s, v27.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0x120]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0x130]\n" + "add x27, x27, #0x80\n" + "bge 4b\n" + "5:" // Main row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr q20, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v19.4s, v20.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v25.4s, v20.4s, v17.4s\n" + "ldr q24, [x23], #0x10\n" + "ldr q23, [x22], #0x10\n" + "zip1 v17.4s, v18.4s, 
v16.4s\n" + "ldr q22, [x21], #0x10\n" + "zip2 v21.4s, v18.4s, v16.4s\n" + "ldr q20, [x20], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v25.4s, v21.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v21.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n" + "str q19, [x27, #0x10]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v20.4s\n" + ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n" + "str q16, [x27, #0xc0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q16, [x27, #0xd0]\n" + "add x27, x27, #0x20\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr s18, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s17, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v18.4s, v18.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + "ldr s19, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "ldr s18, [x20], #0x4\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "zip1 v17.4s, v20.4s, v17.4s\n" + "str d16, [x27, #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0xc0]\n" + "add x27, x27, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x180\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x28, %x[in]\n" + "mov x27, %x[out]\n" + "add x26, x28, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x18\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v9.4s, v17.4s, v16.4s\n" + "ldr q8, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v7.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v6.4s, v18.4s, v16.4s\n" + "ldr q5, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v4.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v3.4s, v17.4s, v16.4s\n" + "ldr q2, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v1.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v0.4s, v18.4s, v16.4s\n" + "ldr q31, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v30.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v29.4s, v17.4s, v16.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v27.4s, v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v26.4s, v18.4s, v17.4s\n" + "ldr q25, [x24], #0x10\n" + "zip1 v17.4s, v20.4s, v16.4s\n" + 
"zip2 v24.4s, v20.4s, v16.4s\n" + "ldr q23, [x24], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "ldr q22, [x24], #0x10\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip1 v16.4s, v9.4s, v24.4s\n" + "ldr q21, [x24], #0x10\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "ldr q20, [x24], #0x10\n" + "zip2 v16.4s, v9.4s, v24.4s\n" + "zip1 v17.4s, v8.4s, v25.4s\n" + "str q19, [x27, #0x0]\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "str q18, [x27, #0x10]\n" + "zip1 v16.4s, v7.4s, v17.4s\n" + "zip2 v19.4s, v7.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v8.4s, v25.4s\n" + "zip1 v16.4s, v6.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v6.4s, v17.4s\n" + "zip1 v17.4s, v5.4s, v23.4s\n" + "zip1 v16.4s, v4.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v4.4s, v17.4s\n" + "zip2 v17.4s, v5.4s, v23.4s\n" + "zip1 v16.4s, v3.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v3.4s, v17.4s\n" + "zip1 v17.4s, v2.4s, v22.4s\n" + "zip1 v16.4s, v1.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v1.4s, v17.4s\n" + "zip2 v17.4s, v2.4s, v22.4s\n" + "zip1 v16.4s, v0.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x60]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v0.4s, v17.4s\n" + "zip1 v17.4s, v31.4s, v21.4s\n" + "zip1 v16.4s, v30.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x70]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v30.4s, v17.4s\n" + "zip2 v17.4s, v31.4s, v21.4s\n" + "zip1 v16.4s, v29.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x80]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v29.4s, v17.4s\n" + "zip1 v17.4s, v28.4s, v20.4s\n" + "zip1 v16.4s, v27.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x90]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v17.4s, v27.4s, v17.4s\n" + "zip2 v18.4s, v28.4s, v20.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + "str q19, [x27, #0xa0]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0xb0]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: width 16 loop: loop + "ldr q17, [x28], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q20, [x26], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v19.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v1.4s, v17.4s, v16.4s\n" + "ldr q0, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v31.4s, v18.4s, v16.4s\n" + "ldr q17, [x28], #0x10\n" + "zip2 v30.4s, v18.4s, v16.4s\n" + "ldr q29, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v28.4s, v17.4s, v16.4s\n" + "ldr q18, [x28], #0x10\n" + "zip2 v27.4s, v17.4s, v16.4s\n" + "ldr q26, [x26], #0x10\n" + "ldr q17, [x25], #0x10\n" + "zip1 v25.4s, 
v18.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v24.4s, v18.4s, v17.4s\n" + "ldr q23, [x24], #0x10\n" + "zip1 v17.4s, v20.4s, v16.4s\n" + "zip2 v22.4s, v20.4s, v16.4s\n" + "ldr q21, [x24], #0x10\n" + "zip1 v16.4s, v19.4s, v17.4s\n" + "zip2 v19.4s, v19.4s, v17.4s\n" + "ldr q20, [x24], #0x10\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip1 v16.4s, v1.4s, v22.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v16.4s, v1.4s, v22.4s\n" + ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n" + "str q17, [x27, #0x0]\n" + ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n" + "zip1 v17.4s, v0.4s, v23.4s\n" + "str q18, [x27, #0x10]\n" + "zip1 v16.4s, v31.4s, v17.4s\n" + "zip2 v19.4s, v31.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v0.4s, v23.4s\n" + "zip1 v16.4s, v30.4s, v17.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + "str q18, [x27, #0x20]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v30.4s, v17.4s\n" + "zip1 v17.4s, v29.4s, v21.4s\n" + "zip1 v16.4s, v28.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x30]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v28.4s, v17.4s\n" + "zip2 v17.4s, v29.4s, v21.4s\n" + "zip1 v16.4s, v27.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x40]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v18.4s, v27.4s, v17.4s\n" + "zip1 v17.4s, v26.4s, v20.4s\n" + "zip1 v16.4s, v25.4s, v17.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + "str q19, [x27, #0x50]\n" + ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n" + "zip2 v17.4s, v25.4s, v17.4s\n" + "zip2 v18.4s, v26.4s, v20.4s\n" + "zip1 v16.4s, v24.4s, v18.4s\n" + ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n" + "str q19, [x27, #0x60]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v24.4s, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x70]\n" + "add x27, x27, #0x80\n" + "bge 14b\n" + "15:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr q19, [x28], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x26], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x25], #0x10\n" + "zip1 v21.4s, v19.4s, v17.4s\n" + "ldr q16, [x24], #0x10\n" + "zip2 v20.4s, v19.4s, v17.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v19.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v21.4s, v17.4s\n" + ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q17, [x27, #0x10]\n" + "add x27, x27, #0x20\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr s17, [x28], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x26], #0x4\n" + "cmp x19, #0x1\n" + "ldr s16, [x25], #0x4\n" + "zip1 v17.4s, v17.4s, v16.4s\n" + "ldr s16, [x24], #0x4\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + 
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace +template<> +void Transform<24, 4, true, VLType::None>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp new file mode 100644 index 0000000000..0a628d372e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 12 * height * sizeof(float); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q17, [x24], #0x10\n" + "shll v3.4s, v17.4h, #0x10\n" + "ldr q16, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "shll2 v23.4s, v17.8h, #0x10\n" + "ldr q17, [x22], #0x10\n" + "cmp x19, #0x18\n" + "shll v22.4s, v16.4h, #0x10\n" + "ldr q19, [x24], #0x10\n" + "shll2 v2.4s, v16.8h, #0x10\n" + "ldr q16, [x22], #0x10\n" + "shll v21.4s, v17.4h, #0x10\n" + "ldr q18, [x21], #0x10\n" + "shll2 v1.4s, v17.8h, #0x10\n" + "ldr q0, [x20], #0x10\n" + "shll v31.4s, v19.4h, #0x10\n" + "ldr q17, [x22], #0x10\n" + "shll2 v30.4s, v19.8h, #0x10\n" + "shll v29.4s, v16.4h, #0x10\n" + "ldr q20, [x21], #0x10\n" + "shll2 v28.4s, v16.8h, #0x10\n" + "ldr q27, [x20], #0x10\n" + "shll v19.4s, v18.4h, #0x10\n" + "ldr q16, [x21], #0x10\n" + "shll v26.4s, v17.4h, #0x10\n" + "shll2 v25.4s, v17.8h, #0x10\n" + "ldr q24, [x20], #0x10\n" + "shll2 v18.4s, v18.8h, #0x10\n" + "str q3, [x23, #0x0]\n" + "shll v17.4s, v20.4h, #0x10\n" + "str q23, [x23, #0x10]\n" + "shll2 v23.4s, v20.8h, #0x10\n" + "str q22, [x23, #0x20]\n" + "shll v22.4s, v16.4h, #0x10\n" + "str q21, [x23, #0x30]\n" + "shll2 v21.4s, v16.8h, #0x10\n" + "str q1, [x23, #0x40]\n" + "shll v16.4s, v0.4h, #0x10\n" + "str q29, [x23, #0x50]\n" + "shll2 v20.4s, v0.8h, #0x10\n" + "str q19, [x23, #0x60]\n" + "shll v19.4s, v27.4h, #0x10\n" + "str q18, [x23, #0x70]\n" + "shll2 v18.4s, v27.8h, #0x10\n" + "str q17, [x23, #0x80]\n" + "shll v17.4s, v24.4h, #0x10\n" + "str q16, [x23, #0x90]\n" + "shll2 v16.4s, v24.8h, #0x10\n" + "str q20, [x23, #0xa0]\n" + "str q19, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "str q2, [x23, #0x0]\n" + "str q31, [x23, #0x10]\n" + "str q30, [x23, #0x20]\n" + "str q28, [x23, #0x30]\n" + "str q26, [x23, #0x40]\n" + "str q25, [x23, #0x50]\n" + "str q23, [x23, #0x60]\n" + "str q22, [x23, #0x70]\n" + "str q21, [x23, #0x80]\n" + "str q18, [x23, #0x90]\n" + "str q17, [x23, #0xa0]\n" + "str q16, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q17, [x24], #0x10\n" + "shll v19.4s, v17.4h, #0x10\n" + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "shll2 v27.4s, v17.8h, #0x10\n" + "ldr q17, [x22], #0x10\n" + "cmp x19, #0xc\n" + "shll v26.4s, v16.4h, #0x10\n" + "ldr q16, [x21], #0x10\n" + "ldr q25, [x20], #0x10\n" + "shll v24.4s, v17.4h, #0x10\n" + "shll2 v23.4s, v17.8h, #0x10\n" + "ldr d18, [x22], #0x8\n" + "shll v22.4s, v16.4h, #0x10\n" + "ldr d17, [x21], #0x8\n" + "shll2 v21.4s, v16.8h, #0x10\n" + "ldr d16, [x20], #0x8\n" + "shll v20.4s, v25.4h, #0x10\n" + "str q19, [x23, #0x0]\n" + "shll v19.4s, v18.4h, #0x10\n" + "str q27, [x23, #0x10]\n" + "shll2 v18.4s, v25.8h, #0x10\n" + "str q26, [x23, #0x20]\n" + "shll v17.4s, v17.4h, #0x10\n" + "str q24, [x23, #0x30]\n" + "shll v16.4s, v16.4h, #0x10\n" + "str q23, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "str q22, 
[x23, #0x60]\n" + "str q21, [x23, #0x70]\n" + "str q17, [x23, #0x80]\n" + "str q20, [x23, #0x90]\n" + "str q18, [x23, #0xa0]\n" + "str q16, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "shll v19.4s, v16.4h, #0x10\n" + "ldr d16, [x22], #0x8\n" + "sub x19, x19, #0x4\n" + "shll v18.4s, v16.4h, #0x10\n" + "ldr d16, [x21], #0x8\n" + "cmp x19, #0x4\n" + "shll v17.4s, v16.4h, #0x10\n" + "ldr d16, [x20], #0x8\n" + "str q19, [x23, #0x0]\n" + "shll v16.4s, v16.4h, #0x10\n" + "str q18, [x23, #0x30]\n" + "str q17, [x23, #0x60]\n" + "str q16, [x23, #0x90]\n" + "add x23, x23, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "shll v19.4s, v16.4h, #0x10\n" + "ldr h16, [x22], #0x2\n" + "sub x19, x19, #0x1\n" + "shll v18.4s, v16.4h, #0x10\n" + "ldr h16, [x21], #0x2\n" + "cmp x19, #0x1\n" + "shll v17.4s, v16.4h, #0x10\n" + "ldr h16, [x20], #0x2\n" + "str s19, [x23, #0x0]\n" + "shll v16.4s, v16.4h, #0x10\n" + "str s18, [x23, #0x30]\n" + "str s17, [x23, #0x60]\n" + "str s16, [x23, #0x90]\n" + "add x23, x23, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q16, [x24], #0x10\n" + "shll v20.4s, v16.4h, #0x10\n" + "ldr q18, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "shll2 v17.4s, v16.8h, #0x10\n" + "ldr q19, [x24], #0x10\n" + "shll v16.4s, v18.4h, #0x10\n" + "cmp x19, #0x18\n" + "shll2 v18.4s, v18.8h, #0x10\n" + "str q20, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "shll v17.4s, v19.4h, #0x10\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "shll2 v16.4s, v19.8h, #0x10\n" + "str q18, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q17, [x24], #0x10\n" + "shll v18.4s, v17.4h, #0x10\n" + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "shll2 v17.4s, v17.8h, #0x10\n" + "str q18, [x23, #0x0]\n" + "cmp x19, #0xc\n" + "shll v16.4s, v16.4h, #0x10\n" + "str q17, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "shll v16.4s, v16.4h, #0x10\n" + "str q16, [x23, #0x0]\n" + "sub x19, x19, #0x4\n" + "add x23, x23, #0x10\n" + "cmp x19, #0x4\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "shll v16.4s, v16.4h, #0x10\n" + "str s16, [x23, #0x0]\n" + "sub x19, x19, #0x1\n" + "add x23, x23, #0x4\n" + "cmp x19, #0x1\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x30\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), 
[out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace +template<> +void Transform<12, 1, true, VLType::None>( + float *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24_bf16fp32( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp new file mode 100644 index 0000000000..7bac8173e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 12 * height * sizeof(float); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q17, [x24], #0x10\n" + "fcvtl v3.4s, v17.4h\n" + "ldr q16, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "fcvtl2 v23.4s, v17.8h\n" + "ldr q17, [x22], #0x10\n" + "cmp x19, #0x18\n" + "fcvtl v22.4s, v16.4h\n" + "ldr q19, [x24], #0x10\n" + "fcvtl2 v2.4s, v16.8h\n" + "ldr q16, [x22], #0x10\n" + "fcvtl v21.4s, v17.4h\n" + "ldr q18, [x21], #0x10\n" + "fcvtl2 v1.4s, v17.8h\n" + "ldr q0, [x20], #0x10\n" + "fcvtl v31.4s, v19.4h\n" + "ldr q17, [x22], #0x10\n" + "fcvtl2 v30.4s, v19.8h\n" + "fcvtl v29.4s, v16.4h\n" + "ldr q20, [x21], #0x10\n" + "fcvtl2 v28.4s, v16.8h\n" + "ldr q27, [x20], #0x10\n" + "fcvtl v19.4s, v18.4h\n" + "ldr q16, [x21], #0x10\n" + "fcvtl v26.4s, v17.4h\n" + "fcvtl2 v25.4s, v17.8h\n" + "ldr q24, [x20], #0x10\n" + "fcvtl2 v18.4s, v18.8h\n" + "str q3, [x23, #0x0]\n" + "fcvtl v17.4s, v20.4h\n" + "str q23, [x23, #0x10]\n" + "fcvtl2 v23.4s, v20.8h\n" + "str q22, [x23, #0x20]\n" + "fcvtl v22.4s, v16.4h\n" + "str q21, [x23, #0x30]\n" + "fcvtl2 v21.4s, v16.8h\n" + "str q1, [x23, #0x40]\n" + "fcvtl v16.4s, v0.4h\n" + "str q29, [x23, #0x50]\n" + "fcvtl2 v20.4s, v0.8h\n" + "str q19, [x23, #0x60]\n" + "fcvtl v19.4s, v27.4h\n" + "str q18, [x23, #0x70]\n" + "fcvtl2 v18.4s, v27.8h\n" + "str q17, [x23, #0x80]\n" + "fcvtl v17.4s, v24.4h\n" + "str q16, [x23, #0x90]\n" + "fcvtl2 v16.4s, v24.8h\n" + "str q20, [x23, #0xa0]\n" + "str q19, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "str q2, [x23, #0x0]\n" + "str q31, [x23, #0x10]\n" + "str q30, [x23, #0x20]\n" + "str q28, [x23, #0x30]\n" + "str q26, [x23, #0x40]\n" + "str q25, [x23, #0x50]\n" + "str q23, [x23, #0x60]\n" + "str q22, [x23, #0x70]\n" + "str q21, [x23, #0x80]\n" + "str q18, [x23, #0x90]\n" + "str q17, [x23, #0xa0]\n" + "str q16, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q17, [x24], #0x10\n" + "fcvtl v19.4s, v17.4h\n" + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "fcvtl2 v27.4s, v17.8h\n" + "ldr q17, [x22], #0x10\n" + "cmp x19, #0xc\n" + "fcvtl v26.4s, v16.4h\n" + "ldr q16, [x21], #0x10\n" + "ldr q25, [x20], #0x10\n" + "fcvtl v24.4s, v17.4h\n" + "fcvtl2 v23.4s, v17.8h\n" + "ldr d18, [x22], #0x8\n" + "fcvtl v22.4s, v16.4h\n" + "ldr d17, [x21], #0x8\n" + "fcvtl2 v21.4s, v16.8h\n" + "ldr d16, [x20], #0x8\n" + "fcvtl v20.4s, v25.4h\n" + "str q19, [x23, #0x0]\n" + "fcvtl v19.4s, v18.4h\n" + "str q27, [x23, #0x10]\n" + "fcvtl2 v18.4s, v25.8h\n" + "str q26, [x23, #0x20]\n" + "fcvtl v17.4s, v17.4h\n" + "str q24, [x23, #0x30]\n" + "fcvtl v16.4s, v16.4h\n" + "str q23, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "str q22, [x23, #0x60]\n" + "str q21, [x23, #0x70]\n" + "str q17, [x23, #0x80]\n" + "str q20, [x23, #0x90]\n" + "str q18, [x23, #0xa0]\n" + "str q16, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main 
row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "fcvtl v19.4s, v16.4h\n" + "ldr d16, [x22], #0x8\n" + "sub x19, x19, #0x4\n" + "fcvtl v18.4s, v16.4h\n" + "ldr d16, [x21], #0x8\n" + "cmp x19, #0x4\n" + "fcvtl v17.4s, v16.4h\n" + "ldr d16, [x20], #0x8\n" + "str q19, [x23, #0x0]\n" + "fcvtl v16.4s, v16.4h\n" + "str q18, [x23, #0x30]\n" + "str q17, [x23, #0x60]\n" + "str q16, [x23, #0x90]\n" + "add x23, x23, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "fcvtl v19.4s, v16.4h\n" + "ldr h16, [x22], #0x2\n" + "sub x19, x19, #0x1\n" + "fcvtl v18.4s, v16.4h\n" + "ldr h16, [x21], #0x2\n" + "cmp x19, #0x1\n" + "fcvtl v17.4s, v16.4h\n" + "ldr h16, [x20], #0x2\n" + "str s19, [x23, #0x0]\n" + "fcvtl v16.4s, v16.4h\n" + "str s18, [x23, #0x30]\n" + "str s17, [x23, #0x60]\n" + "str s16, [x23, #0x90]\n" + "add x23, x23, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Unroll column loop + "ldr q16, [x24], #0x10\n" + "fcvtl v20.4s, v16.4h\n" + "ldr q18, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "fcvtl2 v17.4s, v16.8h\n" + "ldr q19, [x24], #0x10\n" + "fcvtl v16.4s, v18.4h\n" + "cmp x19, #0x18\n" + "fcvtl2 v18.4s, v18.8h\n" + "str q20, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "fcvtl v17.4s, v19.4h\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "fcvtl2 v16.4s, v19.8h\n" + "str q18, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Unroll column loop skip + "cmp x19, #0xc\n" + "blt 15f\n" + "14:" // Tail row loop: Column loop + "ldr q17, [x24], #0x10\n" + "fcvtl v18.4s, v17.4h\n" + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0xc\n" + "fcvtl2 v17.4s, v17.8h\n" + "str q18, [x23, #0x0]\n" + "fcvtl v16.4s, v16.4h\n" + "str q17, [x23, #0x10]\n" + "cmp x19, #0xc\n" + "str q16, [x23, #0x20]\n" + "add x23, x23, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Column loop skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "fcvtl v16.4s, v16.4h\n" + "str q16, [x23, #0x0]\n" + "sub x19, x19, #0x4\n" + "add x23, x23, #0x10\n" + "cmp x19, #0x4\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "fcvtl v16.4s, v16.4h\n" + "str s16, [x23, #0x0]\n" + "sub x19, x19, #0x1\n" + "add x23, x23, #0x4\n" + "cmp x19, #0x1\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x30\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace 
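
For reference, the generated kernels in this patch all implement the same transform: Transform<IntBy, BlockBy, true, VLType::None> reads a (kmax-k0) x (xmax-x0) block of the source, transposes it into IntBy-wide column panels, and widens or narrows the element type on the fly. Below is a minimal scalar sketch of the fp16-to-fp32 case handled by the specialisation that follows; it is illustrative only (the reference_transpose_interleave_12 name is hypothetical, and the real asm's 4-row unrolling, tail loops and padding are omitted):

    #include <cstddef>

    // Scalar model of the 12-wide fp16 -> fp32 transpose-interleave:
    // panel c holds the 12 source columns starting at column 12*c,
    // stored row-major and widened to fp32, matching the layout the
    // asm above produces with fcvtl/fcvtl2.
    static void reference_transpose_interleave_12(
        float *out, const __fp16 *in,
        size_t width,       // source columns to process (xmax - x0)
        size_t height,      // source rows to process (kmax - k0)
        size_t row_stride)  // elements between consecutive source rows
    {
        const size_t panel = 12 * height;  // floats per output panel
        for (size_t x = 0; x < width; x++) {
            const size_t c = x / 12, j = x % 12;
            for (size_t k = 0; k < height; k++) {
                out[c * panel + k * 12 + j] = static_cast<float>(in[k * row_stride + x]);
            }
        }
    }

The bf16 variants earlier in the patch follow the same pattern: fp32-to-bf16 narrowing uses bfcvtn/bfcvtn2 (emitted as raw .inst words so the file assembles even without BF16 support in the toolchain), while bf16-to-fp32 widening is a plain left shift by 16 bits (shll), since bfloat16 occupies the top half of an IEEE-754 fp32 word.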
+template<> +void Transform<12, 1, true, VLType::None>( + float *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_24_fp16fp32( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(__fp16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp deleted file mode 100644 index bcbe2b84d8..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 12x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform(
-    T* out, const T* const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax
-) {
-    // Redirect to a 24 x uint16_t specialisation
-    TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *>(in),
-        stride*2, x0*2, xmax*2, k0, kmax
-    );
-}
-
-// Generic 24x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
-    T* out, const T* const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax
-) {
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *>(in),
-        stride, x0, xmax, k0, kmax
-    );
-}
-
-// Specialised 24 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
-    __asm __volatile (
-        "LDP q0, q1, [%[in0]], #32\n"
-        "STP q0, q1, [%[out]]\n"
-        ASM_PREFETCH("[%[in0], #192]")
-        "LDR q2, [%[in0]], #16\n"
-        "STR q2, [%[out], #32]\n"
-        : [in0] "+r" (in0), [out] "+r" (out)
-        :
-        : "v0", "v1", "v2", "memory"
-    );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
-    __asm __volatile (
-        "LDP q0, q1, [%[in0]], #32\n"
-        "STP q0, q1, [%[out]]\n"
-        ASM_PREFETCH("[%[in0], #192]")
-        "LDR q2, [%[in0]], #16\n"
-        "LDP q3, q4, [%[in1]], #32\n"
-        "STP q2, q3, [%[out], #32]\n"
-        ASM_PREFETCH("[%[in1], #192]")
-        "LDR q5, [%[in1]], #16\n"
-        "STP q4, q5, [%[out], #64]\n"
-        : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
-    );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
-    __asm __volatile (
-        "LDP q0, q1, [%[in0]], #32\n"
-        "STP q0, q1, [%[out]]\n"
-        "LDR q2, [%[in0]], #16\n"
-        ASM_PREFETCH("[%[in0], #192]")
-        "LDP q3, q4, [%[in1]], #32\n"
-        "STP q2, q3, [%[out], #32]\n"
-        "LDR q5, [%[in1]], #16\n"
-        ASM_PREFETCH("[%[in1], #192]")
-        "STP q4, q5, [%[out], #64]\n"
-        "LDP q6, q7, [%[in2]], #32\n"
-        "STP q6, q7, [%[out], #96]\n"
-        "LDR q8, [%[in2]], #16\n"
-        ASM_PREFETCH("[%[in2], #192]")
-        "LDP q9, q10, [%[in3]], #32\n"
-        "STP q8, q9, [%[out], #128]\n"
-        "LDR q11, [%[in3]], #16\n"
-        "STP q10, q11, [%[out], #160]\n"
-        ASM_PREFETCH("[%[in3], #192]")
-
-        : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
-    );
-}
-
-template <>
-template <>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
-    uint16_t* out, const uint16_t* const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax
-) {
-    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __arch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
new file mode 100644
index 0000000000..912d512643
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_32_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+    uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+    if (height % 4) {
+        memset(pad_row, 0, width * sizeof(uint8_t));
+    }
+
+    size_t out_stride = 32 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+    __asm__ __volatile__(
+      "cmp %x[height], #0x10\n"
+      "blt 10f\n"
+      "1:" // Main row loop: Head
+      "mov x16, %x[in]\n"
+      "mov x15, %x[out]\n"
+      "add x14, x16, %x[in_stride]\n"
+      "add x13, x14, %x[in_stride]\n"
+      "add x12, x13, %x[in_stride]\n"
+      "add x11, x12, %x[in_stride]\n"
+      "add x10, x11, %x[in_stride]\n"
+      "add x9, x10, %x[in_stride]\n"
+      "add x28, x9, %x[in_stride]\n"
+      "add x27, x28, %x[in_stride]\n"
+      "add x26, x27, %x[in_stride]\n"
+      "add x25, x26, %x[in_stride]\n"
+      "add x24, x25, %x[in_stride]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "add x22, x23, %x[in_stride]\n"
+      "add x21, x22, %x[in_stride]\n"
+      "add x20, x21, %x[in_stride]\n"
+      "add %x[in], x20, %x[in_stride]\n"
+      "sub %x[height], %x[height], #0x10\n"
+      "mov x19, %x[width]\n"
+      "cmp x19, #0x20\n"
+      "blt 3f\n"
+      "2:" // Main row loop: Column loop
+      "ldr q14, [x16], #0x10\n"
+      "sub x19, x19, #0x20\n"
+      "ldr q7, [x14], #0x10\n"
+      "cmp x19, #0x20\n"
+      "ldr q1, [x13], #0x10\n"
+      "zip1 v0.16b, v14.16b, v1.16b\n"
+      "ldr q2, [x16], #0x10\n"
+      "zip2 v1.16b, v14.16b, v1.16b\n"
+      "ldr q26, [x14], #0x10\n"
+      "ldr q19, [x13], #0x10\n"
+      "zip1 v20.16b, v2.16b, v19.16b\n"
+      "ldr q15, [x12], #0x10\n"
+      "zip2 v14.16b, v2.16b, v19.16b\n"
+      "ldr q30, [x11], #0x10\n"
+      "ldr q29, [x10], #0x10\n"
+      "zip1 v8.16b, v7.16b, v15.16b\n"
+      "ldr q31, [x12], #0x10\n"
+      "zip2 v22.16b, v7.16b, v15.16b\n"
+      "ldr q28, [x11], #0x10\n"
+      "zip1 v25.16b, v0.16b, v8.16b\n"
+      "ldr q23, [x10], #0x10\n"
+      "zip2 v10.16b, v0.16b, v8.16b\n"
+      "ldr q27, [x9], #0x10\n"
+      "zip1 v4.16b, v1.16b, v22.16b\n"
+      "ldr q0, [x28], #0x10\n"
+      "zip2 v5.16b, v1.16b, v22.16b\n"
+      "ldr q13, [x27], #0x10\n"
+      "zip1 v12.16b, v26.16b, v31.16b\n"
+      "ldr q17, [x26], #0x10\n"
+      "zip1 v24.16b, v20.16b, v12.16b\n"
+      "ldr q18, [x9], #0x10\n"
+      "zip2 v12.16b, v20.16b, v12.16b\n"
+      "ldr q6, [x28], #0x10\n"
+      "zip2 v16.16b, v26.16b, v31.16b\n"
+      "ldr q15, [x27],
#0x10\n" + "zip1 v22.16b, v14.16b, v16.16b\n" + "ldr q1, [x26], #0x10\n" + "zip2 v9.16b, v14.16b, v16.16b\n" + "ldr q8, [x25], #0x10\n" + "zip1 v26.16b, v30.16b, v27.16b\n" + "ldr q19, [x24], #0x10\n" + "zip1 v16.16b, v29.16b, v0.16b\n" + "ldr q7, [x23], #0x10\n" + "zip1 v11.16b, v26.16b, v16.16b\n" + "ldr q14, [x22], #0x10\n" + "zip2 v20.16b, v26.16b, v16.16b\n" + "ldr q16, [x25], #0x10\n" + "zip2 v26.16b, v30.16b, v27.16b\n" + "ldr q31, [x24], #0x10\n" + "zip2 v21.16b, v29.16b, v0.16b\n" + "ldr q0, [x23], #0x10\n" + "zip1 v30.16b, v26.16b, v21.16b\n" + "ldr q29, [x22], #0x10\n" + "zip2 v27.16b, v26.16b, v21.16b\n" + "ldr q3, [x21], #0x10\n" + "zip1 v21.16b, v28.16b, v18.16b\n" + "ldr q2, [x20], #0x10\n" + "zip1 v26.16b, v23.16b, v6.16b\n" + "zip2 v18.16b, v28.16b, v18.16b\n" + "ldr q28, [x21], #0x10\n" + "zip2 v23.16b, v23.16b, v6.16b\n" + "zip1 v6.16b, v21.16b, v26.16b\n" + "zip2 v21.16b, v21.16b, v26.16b\n" + "ldr q26, [x20], #0x10\n" + "str q25, [x15, #0x0]\n" + "zip1 v25.16b, v18.16b, v23.16b\n" + "zip2 v23.16b, v18.16b, v23.16b\n" + "str q10, [x15, #0x10]\n" + "zip1 v18.16b, v13.16b, v8.16b\n" + "str q4, [x15, #0x20]\n" + "zip1 v10.16b, v17.16b, v19.16b\n" + "str q5, [x15, #0x30]\n" + "zip1 v5.16b, v18.16b, v10.16b\n" + "str q24, [x15, #0x40]\n" + "zip2 v24.16b, v18.16b, v10.16b\n" + "str q12, [x15, #0x50]\n" + "zip2 v18.16b, v13.16b, v8.16b\n" + "str q22, [x15, #0x60]\n" + "zip2 v17.16b, v17.16b, v19.16b\n" + "str q9, [x15, #0x70]\n" + "zip1 v9.16b, v18.16b, v17.16b\n" + "str q11, [x15, #0x80]\n" + "zip2 v12.16b, v18.16b, v17.16b\n" + "str q20, [x15, #0x90]\n" + "zip1 v20.16b, v15.16b, v16.16b\n" + "str q30, [x15, #0xa0]\n" + "zip1 v17.16b, v1.16b, v31.16b\n" + "str q27, [x15, #0xb0]\n" + "zip1 v19.16b, v20.16b, v17.16b\n" + "str q6, [x15, #0xc0]\n" + "zip2 v18.16b, v20.16b, v17.16b\n" + "str q21, [x15, #0xd0]\n" + "zip2 v17.16b, v15.16b, v16.16b\n" + "str q25, [x15, #0xe0]\n" + "zip2 v16.16b, v1.16b, v31.16b\n" + "str q23, [x15, #0xf0]\n" + "zip1 v22.16b, v17.16b, v16.16b\n" + "str q5, [x15, #0x100]\n" + "zip2 v21.16b, v17.16b, v16.16b\n" + "str q24, [x15, #0x110]\n" + "zip1 v17.16b, v7.16b, v3.16b\n" + "str q9, [x15, #0x120]\n" + "zip1 v16.16b, v14.16b, v2.16b\n" + "str q12, [x15, #0x130]\n" + "zip1 v20.16b, v17.16b, v16.16b\n" + "str q19, [x15, #0x140]\n" + "zip2 v19.16b, v17.16b, v16.16b\n" + "str q18, [x15, #0x150]\n" + "zip2 v18.16b, v7.16b, v3.16b\n" + "str q22, [x15, #0x160]\n" + "zip2 v16.16b, v14.16b, v2.16b\n" + "str q21, [x15, #0x170]\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "str q20, [x15, #0x180]\n" + "zip2 v16.16b, v18.16b, v16.16b\n" + "str q19, [x15, #0x190]\n" + "zip1 v18.16b, v0.16b, v28.16b\n" + "str q17, [x15, #0x1a0]\n" + "zip1 v17.16b, v29.16b, v26.16b\n" + "str q16, [x15, #0x1b0]\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1c0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1d0]\n" + "zip2 v18.16b, v0.16b, v28.16b\n" + "zip2 v17.16b, v29.16b, v26.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1e0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1f0]\n" + "add x15, x15, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q18, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q20, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v18.16b, v18.16b, v17.16b\n" + "ldr q3, [x11], #0x10\n" + "ldr q2, [x10], 
#0x10\n" + "zip1 v17.16b, v20.16b, v16.16b\n" + "ldr q1, [x9], #0x10\n" + "zip2 v16.16b, v20.16b, v16.16b\n" + "ldr q0, [x28], #0x10\n" + "zip1 v31.16b, v19.16b, v17.16b\n" + "ldr q30, [x27], #0x10\n" + "zip2 v20.16b, v19.16b, v17.16b\n" + "ldr q29, [x26], #0x10\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr q28, [x25], #0x10\n" + "zip2 v18.16b, v18.16b, v16.16b\n" + "ldr q27, [x24], #0x10\n" + "zip1 v17.16b, v3.16b, v1.16b\n" + "ldr q26, [x23], #0x10\n" + "zip1 v16.16b, v2.16b, v0.16b\n" + "ldr q25, [x22], #0x10\n" + "zip1 v24.16b, v17.16b, v16.16b\n" + "ldr q23, [x21], #0x10\n" + "zip2 v22.16b, v17.16b, v16.16b\n" + "ldr q21, [x20], #0x10\n" + "zip2 v17.16b, v3.16b, v1.16b\n" + "str q31, [x15, #0x0]\n" + "zip2 v16.16b, v2.16b, v0.16b\n" + "str q20, [x15, #0x10]\n" + "zip1 v20.16b, v17.16b, v16.16b\n" + "str q19, [x15, #0x20]\n" + "zip2 v19.16b, v17.16b, v16.16b\n" + "str q18, [x15, #0x30]\n" + "zip1 v18.16b, v30.16b, v28.16b\n" + "str q24, [x15, #0x80]\n" + "zip1 v16.16b, v29.16b, v27.16b\n" + "str q22, [x15, #0x90]\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "str q20, [x15, #0xa0]\n" + "zip2 v16.16b, v18.16b, v16.16b\n" + "str q19, [x15, #0xb0]\n" + "zip2 v18.16b, v30.16b, v28.16b\n" + "str q17, [x15, #0x100]\n" + "zip2 v17.16b, v29.16b, v27.16b\n" + "str q16, [x15, #0x110]\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x120]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x130]\n" + "zip1 v18.16b, v26.16b, v23.16b\n" + "zip1 v17.16b, v25.16b, v21.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x180]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x190]\n" + "zip2 v18.16b, v26.16b, v23.16b\n" + "zip2 v17.16b, v25.16b, v21.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1a0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x1b0]\n" + "add x15, x15, #0x40\n" + "bge 4b\n" + "5:" // Main row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr s18, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s17, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr s16, [x12], #0x4\n" + "ldr s18, [x11], #0x4\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr s20, [x10], #0x4\n" + "ldr s17, [x9], #0x4\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr s16, [x28], #0x4\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x26], #0x4\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr s17, [x25], #0x4\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr s16, [x24], #0x4\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr s20, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr s17, [x21], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x20], #0x4\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str q23, [x15, #0x0]\n" + "str q21, [x15, #0x80]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str q18, [x15, #0x100]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x180]\n" + "add x15, x15, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr b18, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b17, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "ldr b18, [x11], #0x1\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr b20, [x10], #0x1\n" + "ldr b17, [x9], #0x1\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr b16, 
[x28], #0x1\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr b18, [x27], #0x1\n" + "ldr b22, [x26], #0x1\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr b17, [x25], #0x1\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr b16, [x24], #0x1\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr b20, [x23], #0x1\n" + "ldr b19, [x22], #0x1\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr b17, [x21], #0x1\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr b16, [x20], #0x1\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str s23, [x15, #0x0]\n" + "str s21, [x15, #0x80]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str s18, [x15, #0x100]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0x180]\n" + "add x15, x15, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x200\n" + "cmp %x[height], #0x10\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add %x[in], x12, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x12, x12, %x[pad_row], GT\n" + "csel x13, x13, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x14, x14, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q17, [x16], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q25, [x14], #0x10\n" + "cmp x19, #0x20\n" + "ldr q16, [x13], #0x10\n" + "zip1 v24.16b, v17.16b, v16.16b\n" + "ldr q18, [x16], #0x10\n" + "zip2 v23.16b, v17.16b, v16.16b\n" + "ldr q22, [x14], #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v21.16b, v18.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v20.16b, v18.16b, v17.16b\n" + "ldr q19, [x12], #0x10\n" + "zip1 v18.16b, v25.16b, v16.16b\n" + "zip2 v17.16b, v25.16b, v16.16b\n" + "zip1 v16.16b, v24.16b, v18.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v24.16b, v18.16b\n" + "str q16, [x15, #0x10]\n" + "zip1 v16.16b, v23.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v23.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "zip1 v17.16b, v22.16b, v19.16b\n" + "zip1 v16.16b, v21.16b, v17.16b\n" + "str q16, [x15, #0x40]\n" + "zip2 v16.16b, v21.16b, v17.16b\n" + "str q16, [x15, #0x50]\n" + "zip2 v17.16b, v22.16b, v19.16b\n" + "zip1 v16.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x60]\n" + "zip2 v16.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x70]\n" + "add x15, x15, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: width 16 loop: loop + "ldr q18, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q21, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v20.16b, v18.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v19.16b, v18.16b, v17.16b\n" + "zip1 v18.16b, v21.16b, v16.16b\n" + "zip2 v17.16b, v21.16b, v16.16b\n" + "zip1 v16.16b, v20.16b, v18.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v20.16b, v18.16b\n" + "str q16, [x15, #0x10]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "bge 14b\n" + "15:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr s17, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s18, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v17.16b, v17.16b, 
v16.16b\n" + "ldr s16, [x12], #0x4\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr b17, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b18, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0x0]\n" + "add x15, x15, #0x4\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x80\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<32, 4, true, VLType::None>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_32_1x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<32, 4, true, VLType::None>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_32_1x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp new file mode 100644 index 0000000000..05e68daba1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t))); + + if (height % 2) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 32 * roundup(height, 2) * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 12f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[width]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x19, x21, %x[in_stride]\n" + "cmp x23, #0x40\n" + "add %x[in], x19, %x[in_stride]\n" + "mov x20, %x[out]\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q14, [x24], #0x10\n" + "ldr q10, [x22], #0x10\n" + "sub x23, x23, #0x40\n" + "zip1 v12.8h, v14.8h, v10.8h\n" + "ldr q5, [x21], #0x10\n" + "ldr q3, [x19], #0x10\n" + "zip2 v31.8h, v14.8h, v10.8h\n" + "zip1 v19.8h, v5.8h, v3.8h\n" + "ldr q27, [x24], #0x10\n" + "ldr q25, [x22], #0x10\n" + "zip1 v11.8h, v27.8h, v25.8h\n" + "zip2 v24.8h, v27.8h, v25.8h\n" + "ldr q6, [x21], #0x10\n" + "ldr q29, [x19], #0x10\n" + "zip2 v15.8h, v5.8h, v3.8h\n" + "zip1 v18.8h, v6.8h, v29.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q9, [x22], #0x10\n" + "zip1 v0.8h, v17.8h, v9.8h\n" + "zip2 v9.8h, v17.8h, v9.8h\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x19], #0x10\n" + "zip2 v8.8h, v6.8h, v29.8h\n" + "zip1 v30.8h, v21.8h, v20.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q5, [x22], #0x10\n" + "zip1 v13.8h, v17.8h, v5.8h\n" + "zip2 v25.8h, v17.8h, v5.8h\n" + "ldr q7, [x21], #0x10\n" + "ldr q29, [x19], #0x10\n" + "zip2 v27.8h, v21.8h, v20.8h\n" + "zip1 v14.8h, v7.8h, v29.8h\n" + "ldr q28, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip2 v1.8h, v7.8h, v29.8h\n" + "cmp x23, #0x40\n" + "ldr q10, [x21], #0x10\n" + "ldr q21, [x19], #0x10\n" + "zip1 v16.8h, v28.8h, v17.8h\n" + "zip2 v17.8h, v28.8h, v17.8h\n" + "ldr q5, [x24], #0x10\n" + "ldr q20, [x22], #0x10\n" + "zip1 v3.8h, v5.8h, v20.8h\n" + "zip2 v7.8h, v5.8h, v20.8h\n" + "ldr q22, [x21], #0x10\n" + "ldr q29, [x19], #0x10\n" + "zip1 v2.8h, v10.8h, v21.8h\n" + "zip2 v5.8h, v10.8h, v21.8h\n" + "ldr q21, [x24], #0x10\n" + "ldr q20, [x22], #0x10\n" + "zip1 v4.8h, v21.8h, v20.8h\n" + "zip2 v28.8h, v21.8h, v20.8h\n" + "ldr q6, [x21], #0x10\n" + "ldr q10, [x19], #0x10\n" + "zip1 v26.8h, v22.8h, v29.8h\n" + "zip2 v20.8h, v22.8h, v29.8h\n" + "ldr q29, [x24], #0x10\n" + "ldr q23, [x22], #0x10\n" + "zip1 v21.8h, v29.8h, v23.8h\n" + "zip2 v23.8h, v29.8h, v23.8h\n" + "ldr q22, [x21], #0x10\n" + "ldr q29, [x19], #0x10\n" + "str q12, [x20, #0x0]\n" + "zip1 v12.8h, v6.8h, v10.8h\n" + "str q31, [x20, #0x10]\n" + "zip2 v6.8h, v6.8h, v10.8h\n" + "zip1 v31.8h, v22.8h, v29.8h\n" + "str q11, [x20, #0x20]\n" + "zip2 v11.8h, v22.8h, v29.8h\n" + "str q24, [x20, #0x30]\n" + "str q0, [x20, #0x40]\n" + "str q9, [x20, #0x50]\n" + "str q13, [x20, #0x60]\n" + "str q25, [x20, #0x70]\n" + "str q19, [x20, #0x80]\n" + "str q15, [x20, #0x90]\n" + "str q18, [x20, #0xa0]\n" + "str q8, [x20, #0xb0]\n" + "str q30, [x20, #0xc0]\n" + "str q27, [x20, #0xd0]\n" + "str q14, [x20, #0xe0]\n" + "str q1, [x20, #0xf0]\n" + "add x20, x20, %x[out_stride]\n" + "str q16, [x20, #0x0]\n" + "str q17, [x20, #0x10]\n" + "str q3, [x20, #0x20]\n" + "str q7, [x20, #0x30]\n" + "str q4, [x20, #0x40]\n" + "str q28, [x20, #0x50]\n" + "str q21, [x20, #0x60]\n" + "str q23, [x20, #0x70]\n" + "str q2, [x20, #0x80]\n" + "str q5, 
[x20, #0x90]\n" + "str q26, [x20, #0xa0]\n" + "str q20, [x20, #0xb0]\n" + "str q12, [x20, #0xc0]\n" + "str q6, [x20, #0xd0]\n" + "str q31, [x20, #0xe0]\n" + "str q11, [x20, #0xf0]\n" + "add x20, x20, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x23, #0x20\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "sub x23, x23, #0x20\n" + "cmp x23, #0x20\n" + "ldr q21, [x21], #0x10\n" + "ldr q18, [x19], #0x10\n" + "zip1 v1.8h, v17.8h, v16.8h\n" + "zip2 v0.8h, v17.8h, v16.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v31.8h, v17.8h, v16.8h\n" + "zip2 v30.8h, v17.8h, v16.8h\n" + "ldr q20, [x21], #0x10\n" + "ldr q19, [x19], #0x10\n" + "zip1 v29.8h, v21.8h, v18.8h\n" + "zip2 v28.8h, v21.8h, v18.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v27.8h, v17.8h, v16.8h\n" + "zip2 v26.8h, v17.8h, v16.8h\n" + "ldr q25, [x21], #0x10\n" + "ldr q18, [x19], #0x10\n" + "zip1 v24.8h, v20.8h, v19.8h\n" + "zip2 v23.8h, v20.8h, v19.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v22.8h, v17.8h, v16.8h\n" + "zip2 v21.8h, v17.8h, v16.8h\n" + "ldr q20, [x21], #0x10\n" + "ldr q16, [x19], #0x10\n" + "zip1 v19.8h, v25.8h, v18.8h\n" + "zip2 v18.8h, v25.8h, v18.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "str q1, [x20, #0x0]\n" + "str q0, [x20, #0x10]\n" + "str q31, [x20, #0x20]\n" + "str q30, [x20, #0x30]\n" + "str q27, [x20, #0x40]\n" + "str q26, [x20, #0x50]\n" + "str q22, [x20, #0x60]\n" + "str q21, [x20, #0x70]\n" + "str q29, [x20, #0x80]\n" + "str q28, [x20, #0x90]\n" + "str q24, [x20, #0xa0]\n" + "str q23, [x20, #0xb0]\n" + "str q19, [x20, #0xc0]\n" + "str q18, [x20, #0xd0]\n" + "str q17, [x20, #0xe0]\n" + "str q16, [x20, #0xf0]\n" + "add x20, x20, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x23, #0x10\n" + "blt 7f\n" + "6:" // Main row loop: width 16 loop: loop + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "sub x23, x23, #0x10\n" + "cmp x23, #0x10\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x19], #0x10\n" + "zip1 v19.8h, v17.8h, v16.8h\n" + "zip2 v18.8h, v17.8h, v16.8h\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v22.8h, v17.8h, v16.8h\n" + "zip2 v21.8h, v17.8h, v16.8h\n" + "ldr q20, [x21], #0x10\n" + "ldr q16, [x19], #0x10\n" + "str q19, [x20, #0x0]\n" + "zip1 v19.8h, v24.8h, v23.8h\n" + "str q18, [x20, #0x10]\n" + "zip2 v18.8h, v24.8h, v23.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q22, [x20, #0x20]\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "str q21, [x20, #0x30]\n" + "str q19, [x20, #0x80]\n" + "str q18, [x20, #0x90]\n" + "str q17, [x20, #0xa0]\n" + "str q16, [x20, #0xb0]\n" + "add x20, x20, #0x40\n" + "bge 6b\n" + "7:" // Main row loop: width 16 loop: skip + "cmp x23, #0x4\n" + "blt 9f\n" + "8:" // Main row loop: width 4 loop: loop + "ldr d19, [x24], #0x8\n" + "ldr d16, [x22], #0x8\n" + "sub x23, x23, #0x4\n" + "cmp x23, #0x4\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x19], #0x8\n" + "zip1 v16.8h, v19.8h, v16.8h\n" + "str q16, [x20, #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [x20, #0x80]\n" + "add x20, x20, #0x10\n" + "bge 8b\n" + "9:" // Main row loop: width 4 loop: skip + "cmp x23, #0x1\n" + "blt 11f\n" + "10:" // Main row loop: width 1 loop: loop + "ldr h19, [x24], #0x2\n" + "ldr h16, [x22], #0x2\n" + "sub x23, x23, #0x1\n" + "cmp x23, #0x1\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x19], #0x2\n" + "zip1 v16.8h, v19.8h, v16.8h\n" 
+ "str s16, [x20, #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str s16, [x20, #0x80]\n" + "add x20, x20, #0x4\n" + "bge 10b\n" + "11:" // Main row loop: width 1 loop: skip + "cmp %x[height], #0x4\n" + "add %x[out], %x[out], #0x100\n" + "bge 1b\n" + "cbz %x[height], 24f\n" + "12:" // Main loop skip + + "13:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x19, %x[width]\n" + "add x22, x24, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "add %x[in], x22, %x[in_stride]\n" + "csel x22, x22, %x[pad_row], GT\n" + "cmp x19, #0x40\n" + "mov x20, %x[out]\n" + "sub %x[height], %x[height], #0x2\n" + "blt 15f\n" + "14:" // Tail row loop: Unroll column loop + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "sub x19, x19, #0x40\n" + "zip1 v0.8h, v18.8h, v17.8h\n" + "ldr q19, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip2 v31.8h, v18.8h, v17.8h\n" + "zip1 v30.8h, v19.8h, v16.8h\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip2 v29.8h, v19.8h, v16.8h\n" + "zip1 v28.8h, v18.8h, v17.8h\n" + "ldr q19, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip2 v27.8h, v18.8h, v17.8h\n" + "zip1 v26.8h, v19.8h, v16.8h\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip2 v25.8h, v19.8h, v16.8h\n" + "cmp x19, #0x40\n" + "ldr q19, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v24.8h, v18.8h, v17.8h\n" + "zip2 v23.8h, v18.8h, v17.8h\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip1 v22.8h, v19.8h, v16.8h\n" + "zip2 v21.8h, v19.8h, v16.8h\n" + "ldr q20, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "str q0, [x20, #0x0]\n" + "zip1 v19.8h, v18.8h, v17.8h\n" + "str q31, [x20, #0x10]\n" + "zip2 v18.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q30, [x20, #0x20]\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "str q29, [x20, #0x30]\n" + "str q28, [x20, #0x40]\n" + "str q27, [x20, #0x50]\n" + "str q26, [x20, #0x60]\n" + "str q25, [x20, #0x70]\n" + "add x20, x20, %x[out_stride]\n" + "str q24, [x20, #0x0]\n" + "str q23, [x20, #0x10]\n" + "str q22, [x20, #0x20]\n" + "str q21, [x20, #0x30]\n" + "str q19, [x20, #0x40]\n" + "str q18, [x20, #0x50]\n" + "str q17, [x20, #0x60]\n" + "str q16, [x20, #0x70]\n" + "add x20, x20, %x[out_stride]\n" + "bge 14b\n" + "15:" // Tail row loop: Unroll column loop skip + "cmp x19, #0x20\n" + "blt 17f\n" + "16:" // Tail row loop: Column loop + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "sub x19, x19, #0x20\n" + "cmp x19, #0x20\n" + "ldr q19, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v24.8h, v18.8h, v17.8h\n" + "zip2 v23.8h, v18.8h, v17.8h\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "zip1 v22.8h, v19.8h, v16.8h\n" + "zip2 v21.8h, v19.8h, v16.8h\n" + "ldr q20, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v19.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "str q24, [x20, #0x0]\n" + "str q23, [x20, #0x10]\n" + "str q22, [x20, #0x20]\n" + "str q21, [x20, #0x30]\n" + "str q19, [x20, #0x40]\n" + "str q18, [x20, #0x50]\n" + "str q17, [x20, #0x60]\n" + "str q16, [x20, #0x70]\n" + "add x20, x20, %x[out_stride]\n" + "bge 16b\n" + "17:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 19f\n" + "18:" // Tail row loop: width 16 loop: loop + "ldr q18, [x24], #0x10\n" + "ldr q17, [x22], #0x10\n" + "sub x19, x19, #0x10\n" + "cmp x19, #0x10\n" + "ldr q20, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + "zip1 v19.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip2 
v16.8h, v20.8h, v16.8h\n" + "str q19, [x20, #0x0]\n" + "str q18, [x20, #0x10]\n" + "str q17, [x20, #0x20]\n" + "str q16, [x20, #0x30]\n" + "add x20, x20, #0x40\n" + "bge 18b\n" + "19:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 21f\n" + "20:" // Tail row loop: width 4 loop: loop + "ldr d17, [x24], #0x8\n" + "ldr d16, [x22], #0x8\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, #0x10\n" + "bge 20b\n" + "21:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 23f\n" + "22:" // Tail row loop: width 1 loop: loop + "ldr h17, [x24], #0x2\n" + "ldr h16, [x22], #0x2\n" + "sub x19, x19, #0x1\n" + "cmp x19, #0x1\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str s16, [x20, #0x0]\n" + "add x20, x20, #0x4\n" + "bge 22b\n" + "23:" // Tail row loop: width 1 loop: skip + "cmp %x[height], #0x1\n" + "add %x[out], %x[out], #0x80\n" + "bge 13b\n" + "24:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace + +template<> +void Transform<32, 2, true, VLType::None>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_32_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp new file mode 100644 index 0000000000..4f7019f564 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 24 * height * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q27, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q26, [x22], #0x10\n" + "cmp x19, #0x18\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q27, [x23, #0x0]\n" + "str q23, [x23, #0x10]\n" + "str q19, [x23, #0x20]\n" + "str q26, [x23, #0x30]\n" + "str q22, [x23, #0x40]\n" + "str q18, [x23, #0x50]\n" + "str q25, [x23, #0x60]\n" + "str q21, [x23, #0x70]\n" + "str q17, [x23, #0x80]\n" + "str q24, [x23, #0x90]\n" + "str q20, [x23, #0xa0]\n" + "str q16, [x23, #0xb0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q23, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q22, [x22], #0x10\n" + "cmp x19, #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q23, [x23, #0x0]\n" + "str q19, [x23, #0x10]\n" + "str q22, [x23, #0x30]\n" + "str q18, [x23, #0x40]\n" + "str q21, [x23, #0x60]\n" + "str q17, [x23, #0x70]\n" + "str q20, [x23, #0x90]\n" + "str q16, [x23, #0xa0]\n" + "add x23, x23, #0x20\n" + "bge 4b\n" + "5:" // Main row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr d19, [x24], #0x8\n" + "sub x19, x19, #0x4\n" + "ldr d18, [x22], #0x8\n" + "cmp x19, #0x4\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "str d19, [x23, #0x0]\n" + "str d18, [x23, #0x30]\n" + "str d17, [x23, #0x60]\n" + "str d16, [x23, #0x90]\n" + "add x23, x23, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr h19, [x24], #0x2\n" + "sub x19, x19, #0x1\n" + "ldr h18, [x22], #0x2\n" + "cmp x19, #0x1\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "str h19, [x23, #0x0]\n" + "str h18, [x23, #0x30]\n" + "str h17, [x23, #0x60]\n" + "str h16, [x23, #0x90]\n" + "add x23, x23, #0x2\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0xc0\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q18, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "cmp x19, #0x18\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q18, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + 
"add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: width 16 loop: loop + "ldr q17, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q17, [x23, #0x0]\n" + "str q16, [x23, #0x10]\n" + "add x23, x23, #0x20\n" + "bge 14b\n" + "15:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr d16, [x24], #0x8\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "str d16, [x23, #0x0]\n" + "add x23, x23, #0x8\n" + "bge 16b\n" + "17:" // Tail row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 19f\n" + "18:" // Tail row loop: width 1 loop: loop + "ldr h16, [x24], #0x2\n" + "sub x19, x19, #0x1\n" + "cmp x19, #0x1\n" + "str h16, [x23, #0x0]\n" + "add x23, x23, #0x2\n" + "bge 18b\n" + "19:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x30\n" + "cmp %x[height], #0x1\n" + "bge 11b\n" + "20:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // anonymous namespace + +template<> +void Transform<12, 1, true, VLType::None>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_48( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 2, + stride * sizeof(float), + (kmax-k0) + ); +} + +template<> +void Transform<24, 1, true, VLType::None>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_48( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + +template<> +void Transform<6, 1, true, VLType::None>( + double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_48( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(double) / 2, + stride * sizeof(double), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp new file mode 100644 index 0000000000..cb20172364 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t))); + + if (height % 16) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 4 * roundup(height, 16) * sizeof(uint8_t); + + __asm__ __volatile__( + + "1:" // Main row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add x11, x12, %x[in_stride]\n" + "add x10, x11, %x[in_stride]\n" + "add x9, x10, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "cmp %x[height], #0xf\n" + "csel x20, x20, %x[pad_row], GT\n" + "csel x21, x21, %x[pad_row], GE\n" + "cmp %x[height], #0xd\n" + "csel x22, x22, %x[pad_row], GT\n" + "csel x23, x23, %x[pad_row], GE\n" + "cmp %x[height], #0xb\n" + "csel x24, x24, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x9\n" + "csel x26, x26, %x[pad_row], GT\n" + "csel x27, x27, %x[pad_row], GE\n" + "cmp %x[height], #0x7\n" + "csel x28, x28, %x[pad_row], GT\n" + "csel x9, x9, %x[pad_row], GE\n" + "cmp %x[height], #0x5\n" + "csel x10, x10, %x[pad_row], GT\n" + "csel x11, x11, %x[pad_row], GE\n" + "cmp %x[height], #0x3\n" + "csel x12, x12, %x[pad_row], GT\n" + "csel x13, x13, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x14, x14, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x10\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q20, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q19, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q18, [x13], #0x10\n" + "ldr q11, [x12], #0x10\n" + "ldr q10, [x11], #0x10\n" + "ldr q9, [x10], #0x10\n" + "ldr q8, [x9], #0x10\n" + "ldr q7, [x28], #0x10\n" + "ldr q16, [x27], #0x10\n" + "zip1 v6.16b, v20.16b, v16.16b\n" + "ldr q17, [x26], #0x10\n" + "zip2 v5.16b, v20.16b, v16.16b\n" + "ldr q16, [x25], #0x10\n" + "ldr q4, [x24], #0x10\n" + "zip1 v3.16b, v19.16b, v17.16b\n" + "ldr q2, [x23], #0x10\n" + "zip2 v1.16b, v19.16b, v17.16b\n" + "ldr q0, [x22], #0x10\n" + "zip1 v31.16b, v18.16b, v16.16b\n" + "ldr q30, [x21], #0x10\n" + "zip1 v27.16b, v11.16b, v4.16b\n" + "ldr q29, [x20], #0x10\n" + "zip2 v28.16b, v18.16b, v16.16b\n" + "zip1 v26.16b, v10.16b, v2.16b\n" + "zip1 v22.16b, v6.16b, v26.16b\n" + "zip1 v25.16b, v8.16b, v30.16b\n" + "zip1 v21.16b, v31.16b, v25.16b\n" + "zip1 v18.16b, v22.16b, v21.16b\n" + "zip1 v24.16b, v9.16b, v0.16b\n" + "zip1 v20.16b, v3.16b, v24.16b\n" + "zip1 v23.16b, v7.16b, v29.16b\n" + "zip1 v19.16b, v27.16b, v23.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + 
"zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x10]\n" + "zip2 v18.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v22.16b, v6.16b, v26.16b\n" + "zip2 v21.16b, v31.16b, v25.16b\n" + "zip1 v18.16b, v22.16b, v21.16b\n" + "zip2 v20.16b, v3.16b, v24.16b\n" + "zip2 v19.16b, v27.16b, v23.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x10]\n" + "zip2 v18.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v27.16b, v10.16b, v2.16b\n" + "zip2 v26.16b, v8.16b, v30.16b\n" + "zip1 v22.16b, v5.16b, v27.16b\n" + "zip1 v21.16b, v28.16b, v26.16b\n" + "zip1 v18.16b, v22.16b, v21.16b\n" + "zip2 v25.16b, v9.16b, v0.16b\n" + "zip1 v20.16b, v1.16b, v25.16b\n" + "zip2 v24.16b, v11.16b, v4.16b\n" + "zip2 v23.16b, v7.16b, v29.16b\n" + "zip1 v19.16b, v24.16b, v23.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x10]\n" + "zip2 v18.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v22.16b, v5.16b, v27.16b\n" + "zip2 v21.16b, v28.16b, v26.16b\n" + "zip1 v18.16b, v22.16b, v21.16b\n" + "zip2 v20.16b, v1.16b, v25.16b\n" + "zip2 v19.16b, v24.16b, v23.16b\n" + "zip1 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x10]\n" + "zip2 v18.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr s17, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s20, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s18, [x13], #0x4\n" + "ldr s19, [x12], #0x4\n" + "ldr s27, [x11], #0x4\n" + "ldr s22, [x10], #0x4\n" + "ldr s26, [x9], #0x4\n" + "ldr s25, [x28], #0x4\n" + "ldr s16, [x27], #0x4\n" + "zip1 v21.16b, v17.16b, v16.16b\n" + "ldr s17, [x26], #0x4\n" + "ldr s16, [x25], #0x4\n" + "zip1 v24.16b, v18.16b, v16.16b\n" + "ldr s18, [x24], #0x4\n" + "zip1 v20.16b, v20.16b, v17.16b\n" + "ldr s17, [x23], #0x4\n" + "ldr s16, [x22], #0x4\n" + "zip1 v23.16b, v19.16b, v18.16b\n" + "ldr s18, [x21], #0x4\n" + "ldr s19, [x20], #0x4\n" + "zip1 v17.16b, v27.16b, v17.16b\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "zip1 v22.16b, v21.16b, v17.16b\n" + "zip1 v21.16b, v20.16b, v16.16b\n" + "zip1 v16.16b, v26.16b, v18.16b\n" + "zip1 v20.16b, v24.16b, v16.16b\n" + "zip1 v18.16b, v22.16b, v20.16b\n" + "zip1 v16.16b, v25.16b, v19.16b\n" + "zip1 v19.16b, v23.16b, v16.16b\n" + "zip1 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, 
#0x10]\n" + "zip2 v18.16b, v22.16b, v20.16b\n" + "zip2 v17.16b, v21.16b, v19.16b\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x1\n" + "blt 7f\n" + "6:" // Main row loop: width 1 loop: loop + "ldr b17, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b21, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b18, [x13], #0x1\n" + "ldr b20, [x12], #0x1\n" + "ldr b27, [x11], #0x1\n" + "ldr b26, [x10], #0x1\n" + "ldr b25, [x9], #0x1\n" + "ldr b24, [x28], #0x1\n" + "ldr b16, [x27], #0x1\n" + "zip1 v23.16b, v17.16b, v16.16b\n" + "ldr b17, [x26], #0x1\n" + "ldr b16, [x25], #0x1\n" + "zip1 v22.16b, v18.16b, v16.16b\n" + "ldr b19, [x24], #0x1\n" + "zip1 v18.16b, v21.16b, v17.16b\n" + "ldr b17, [x23], #0x1\n" + "ldr b16, [x22], #0x1\n" + "zip1 v21.16b, v20.16b, v19.16b\n" + "ldr b20, [x21], #0x1\n" + "ldr b19, [x20], #0x1\n" + "zip1 v17.16b, v27.16b, v17.16b\n" + "zip1 v16.16b, v26.16b, v16.16b\n" + "zip1 v17.16b, v23.16b, v17.16b\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v25.16b, v20.16b\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "zip1 v16.16b, v24.16b, v19.16b\n" + "zip1 v16.16b, v21.16b, v16.16b\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x40\n" + "cmp %x[height], #0x1\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<4, 16, true, VLType::None>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_1x16( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<4, 16, true, VLType::None>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_1x16( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp new file mode 100644 index 0000000000..27cebe26cf --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 4 * roundup(height, 4) * sizeof(uint8_t); + + __asm__ __volatile__( + "cmp %x[height], #0x10\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add x11, x12, %x[in_stride]\n" + "add x10, x11, %x[in_stride]\n" + "add x9, x10, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x10\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q20, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q19, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q16, [x13], #0x10\n" + "zip1 v18.16b, v20.16b, v16.16b\n" + "ldr q17, [x12], #0x10\n" + "zip2 v5.16b, v20.16b, v16.16b\n" + "ldr q4, [x11], #0x10\n" + "ldr q3, [x10], #0x10\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "ldr q2, [x9], #0x10\n" + "zip2 v1.16b, v19.16b, v17.16b\n" + "ldr q0, [x28], #0x10\n" + "zip1 v22.16b, v18.16b, v16.16b\n" + "ldr q31, [x27], #0x10\n" + "zip2 v21.16b, v18.16b, v16.16b\n" + "ldr q30, [x26], #0x10\n" + "zip1 v29.16b, v5.16b, v1.16b\n" + "ldr q28, [x25], #0x10\n" + "zip1 v17.16b, v4.16b, v2.16b\n" + "ldr q27, [x24], #0x10\n" + "zip1 v16.16b, v3.16b, v0.16b\n" + "ldr q26, [x23], #0x10\n" + "zip1 v19.16b, v17.16b, v16.16b\n" + "ldr q25, [x22], #0x10\n" + "zip2 v20.16b, v17.16b, v16.16b\n" + "ldr q24, [x21], #0x10\n" + "zip1 v18.16b, v31.16b, v28.16b\n" + "ldr q23, [x20], #0x10\n" + "zip1 v17.16b, v30.16b, v27.16b\n" + "str q22, [x15, #0x0]\n" + "zip1 v16.16b, v18.16b, v17.16b\n" + "str q19, [x15, #0x10]\n" + "zip2 v19.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip1 v18.16b, v26.16b, v24.16b\n" + "zip1 v17.16b, v25.16b, v23.16b\n" + "zip1 
v16.16b, v18.16b, v17.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v17.16b, v18.16b, v17.16b\n" + "str q21, [x15, #0x0]\n" + "zip2 v22.16b, v4.16b, v2.16b\n" + "str q20, [x15, #0x10]\n" + "zip2 v21.16b, v3.16b, v0.16b\n" + "str q19, [x15, #0x20]\n" + "zip1 v16.16b, v22.16b, v21.16b\n" + "str q17, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v20.16b, v31.16b, v28.16b\n" + "str q29, [x15, #0x0]\n" + "zip2 v17.16b, v30.16b, v27.16b\n" + "str q16, [x15, #0x10]\n" + "zip1 v16.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x20]\n" + "zip2 v19.16b, v26.16b, v24.16b\n" + "zip2 v18.16b, v25.16b, v23.16b\n" + "zip1 v16.16b, v19.16b, v18.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v16.16b, v5.16b, v1.16b\n" + "str q16, [x15, #0x0]\n" + "zip2 v16.16b, v22.16b, v21.16b\n" + "zip2 v17.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x10]\n" + "zip2 v16.16b, v19.16b, v18.16b\n" + "str q17, [x15, #0x20]\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x19, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr s18, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s17, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr s16, [x12], #0x4\n" + "ldr s18, [x11], #0x4\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr s20, [x10], #0x4\n" + "ldr s17, [x9], #0x4\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr s16, [x28], #0x4\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x26], #0x4\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr s17, [x25], #0x4\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr s16, [x24], #0x4\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr s20, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr s17, [x21], #0x4\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr s16, [x20], #0x4\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str q23, [x15, #0x0]\n" + "str q21, [x15, #0x10]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str q18, [x15, #0x20]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x30]\n" + "add x15, x15, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cmp x19, #0x1\n" + "blt 7f\n" + "6:" // Main row loop: width 1 loop: loop + "ldr b18, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b17, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v19.16b, v18.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "ldr b18, [x11], #0x1\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "ldr b20, [x10], #0x1\n" + "ldr b17, [x9], #0x1\n" + "zip1 v23.16b, v19.16b, v16.16b\n" + "ldr b16, [x28], #0x1\n" + "zip1 v19.16b, v18.16b, v17.16b\n" + "ldr b18, [x27], #0x1\n" + "ldr b22, [x26], #0x1\n" + "zip1 v16.16b, v20.16b, v16.16b\n" + "ldr b17, [x25], #0x1\n" + "zip1 v21.16b, v19.16b, v16.16b\n" + "ldr b16, [x24], #0x1\n" + "zip1 v18.16b, v18.16b, v17.16b\n" + "ldr b20, [x23], #0x1\n" + "ldr b19, [x22], #0x1\n" + "zip1 v16.16b, v22.16b, v16.16b\n" + "ldr b17, [x21], #0x1\n" + "zip1 v18.16b, v18.16b, v16.16b\n" + "ldr b16, [x20], #0x1\n" + "zip1 v17.16b, v20.16b, v17.16b\n" + "str s23, [x15, #0x0]\n" + "str s21, [x15, #0x10]\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "str s18, [x15, #0x20]\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0x30]\n" + "add x15, x15, #0x4\n" + "bge 6b\n" + "7:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x40\n" + "cmp %x[height], #0x10\n" + "bge 
1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + + "9:" // Tail row loop: Head + "mov x16, %x[in]\n" + "mov x15, %x[out]\n" + "add x14, x16, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "add %x[in], x12, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x12, x12, %x[pad_row], GT\n" + "csel x13, x13, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x14, x14, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x10\n" + "blt 11f\n" + "10:" // Tail row loop: Unroll column loop + "ldr q19, [x16], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q18, [x14], #0x10\n" + "cmp x19, #0x10\n" + "ldr q17, [x13], #0x10\n" + "zip1 v20.16b, v19.16b, v17.16b\n" + "ldr q16, [x12], #0x10\n" + "zip2 v19.16b, v19.16b, v17.16b\n" + "zip1 v17.16b, v18.16b, v16.16b\n" + "zip2 v18.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v16.16b, v20.16b, v17.16b\n" + "str q16, [x15, #0x0]\n" + "zip1 v17.16b, v19.16b, v18.16b\n" + "add x15, x15, %x[out_stride]\n" + "zip2 v16.16b, v19.16b, v18.16b\n" + "str q17, [x15, #0x0]\n" + "add x15, x15, %x[out_stride]\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Unroll column loop skip + "cmp x19, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr s17, [x16], #0x4\n" + "sub x19, x19, #0x4\n" + "ldr s18, [x14], #0x4\n" + "cmp x19, #0x4\n" + "ldr s16, [x13], #0x4\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr s16, [x12], #0x4\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str q16, [x15, #0x0]\n" + "add x15, x15, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x1\n" + "blt 15f\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr b17, [x16], #0x1\n" + "sub x19, x19, #0x1\n" + "ldr b18, [x14], #0x1\n" + "cmp x19, #0x1\n" + "ldr b16, [x13], #0x1\n" + "zip1 v17.16b, v17.16b, v16.16b\n" + "ldr b16, [x12], #0x1\n" + "zip1 v16.16b, v18.16b, v16.16b\n" + "zip1 v16.16b, v17.16b, v16.16b\n" + "str s16, [x15, #0x0]\n" + "add x15, x15, #0x4\n" + "bge 14b\n" + "15:" // Tail row loop: width 1 loop: skip + "add %x[out], %x[out], #0x10\n" + "cmp %x[height], #0x1\n" + "bge 9b\n" + "16:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace + +template<> +void Transform<4, 4, true, VLType::None>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_1x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<4, 4, true, VLType::None>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_1x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif diff --git 
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp new file mode 100644 index 0000000000..c341b315aa --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 32 * height * sizeof(uint16_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x20\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q31, [x24], #0x10\n" + "sub x19, x19, #0x20\n" + "ldr q30, [x22], #0x10\n" + "cmp x19, #0x20\n" + "ldr q29, [x21], #0x10\n" + "ldr q28, [x20], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q26, [x22], #0x10\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q31, [x23, #0x0]\n" + "str q27, [x23, #0x10]\n" + "str q23, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q30, [x23, #0x40]\n" + "str q26, [x23, #0x50]\n" + "str q22, [x23, #0x60]\n" + "str q18, [x23, #0x70]\n" + "str q29, [x23, #0x80]\n" + "str q25, [x23, #0x90]\n" + "str q21, [x23, #0xa0]\n" + "str q17, [x23, #0xb0]\n" + "str q28, [x23, #0xc0]\n" + "str q24, [x23, #0xd0]\n" + "str q20, [x23, #0xe0]\n" + "str q16, [x23, #0xf0]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q23, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q22, [x22], #0x10\n" + "cmp x19, #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q23, [x23, #0x0]\n" + "str q19, 
[x23, #0x10]\n"
+      "str q22, [x23, #0x40]\n"
+      "str q18, [x23, #0x50]\n"
+      "str q21, [x23, #0x80]\n"
+      "str q17, [x23, #0x90]\n"
+      "str q20, [x23, #0xc0]\n"
+      "str q16, [x23, #0xd0]\n"
+      "add x23, x23, #0x20\n"
+      "bge 4b\n"
+      "5:" // Main row loop: width 16 loop: skip
+      "cmp x19, #0x4\n"
+      "blt 7f\n"
+      "6:" // Main row loop: width 4 loop: loop
+      "ldr d19, [x24], #0x8\n"
+      "sub x19, x19, #0x4\n"
+      "ldr d18, [x22], #0x8\n"
+      "cmp x19, #0x4\n"
+      "ldr d17, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "str d19, [x23, #0x0]\n"
+      "str d18, [x23, #0x40]\n"
+      "str d17, [x23, #0x80]\n"
+      "str d16, [x23, #0xc0]\n"
+      "add x23, x23, #0x8\n"
+      "bge 6b\n"
+      "7:" // Main row loop: width 4 loop: skip
+      "cmp x19, #0x1\n"
+      "blt 9f\n"
+      "8:" // Main row loop: width 1 loop: loop
+      "ldr h19, [x24], #0x2\n"
+      "sub x19, x19, #0x1\n"
+      "ldr h18, [x22], #0x2\n"
+      "cmp x19, #0x1\n"
+      "ldr h17, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "str h19, [x23, #0x0]\n"
+      "str h18, [x23, #0x40]\n"
+      "str h17, [x23, #0x80]\n"
+      "str h16, [x23, #0xc0]\n"
+      "add x23, x23, #0x2\n"
+      "bge 8b\n"
+      "9:" // Main row loop: width 1 loop: skip
+      "add %x[out], %x[out], #0x100\n"
+      "cmp %x[height], #0x4\n"
+      "bge 1b\n"
+      "cbz %x[height], 20f\n"
+      "10:" // Main loop skip
+
+      "11:" // Tail row loop: Head
+      "mov x24, %x[in]\n"
+      "mov x23, %x[out]\n"
+      "add %x[in], x24, %x[in_stride]\n"
+      "sub %x[height], %x[height], #0x1\n"
+      "mov x19, %x[width]\n"
+      "cmp x19, #0x20\n"
+      "blt 13f\n"
+      "12:" // Tail row loop: Column loop
+      "ldr q19, [x24], #0x10\n"
+      "sub x19, x19, #0x20\n"
+      "cmp x19, #0x20\n"
+      "ldr q18, [x24], #0x10\n"
+      "ldr q17, [x24], #0x10\n"
+      "ldr q16, [x24], #0x10\n"
+      "str q19, [x23, #0x0]\n"
+      "str q18, [x23, #0x10]\n"
+      "str q17, [x23, #0x20]\n"
+      "str q16, [x23, #0x30]\n"
+      "add x23, x23, %x[out_stride]\n"
+      "bge 12b\n"
+      "13:" // Tail row loop: Column loop skip
+      "cmp x19, #0x10\n"
+      "blt 15f\n"
+      "14:" // Tail row loop: width 16 loop: loop
+      "ldr q17, [x24], #0x10\n"
+      "sub x19, x19, #0x10\n"
+      "cmp x19, #0x10\n"
+      "ldr q16, [x24], #0x10\n"
+      "str q17, [x23, #0x0]\n"
+      "str q16, [x23, #0x10]\n"
+      "add x23, x23, #0x20\n"
+      "bge 14b\n"
+      "15:" // Tail row loop: width 16 loop: skip
+      "cmp x19, #0x4\n"
+      "blt 17f\n"
+      "16:" // Tail row loop: width 4 loop: loop
+      "ldr d16, [x24], #0x8\n"
+      "sub x19, x19, #0x4\n"
+      "cmp x19, #0x4\n"
+      "str d16, [x23, #0x0]\n"
+      "add x23, x23, #0x8\n"
+      "bge 16b\n"
+      "17:" // Tail row loop: width 4 loop: skip
+      "cmp x19, #0x1\n"
+      "blt 19f\n"
+      "18:" // Tail row loop: width 1 loop: loop
+      "ldr h16, [x24], #0x2\n"
+      "sub x19, x19, #0x1\n"
+      "cmp x19, #0x1\n"
+      "str h16, [x23, #0x0]\n"
+      "add x23, x23, #0x2\n"
+      "bge 18b\n"
+      "19:" // Tail row loop: width 1 loop: skip
+      "add %x[out], %x[out], #0x40\n"
+      "cmp %x[height], #0x1\n"
+      "bge 11b\n"
+      "20:" // Done
+
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 1, true, VLType::None>(
+    float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    a64_transpose_interleave_64(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(float) / 2,
+        stride * sizeof(float),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+    __fp16 *out, const
__fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    a64_transpose_interleave_64(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(__fp16) / 2,
+        stride * sizeof(__fp16),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+    uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    a64_transpose_interleave_64(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(uint16_t) / 2,
+        stride * sizeof(uint16_t),
+        (kmax-k0)
+    );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
deleted file mode 100644
index df68740bb4..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
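
All three specialisations above funnel into the same fixed-width uint16_t kernel: a blocked transpose only moves bytes, so a float or __fp16 matrix can be reinterpreted as rows of 16-bit words with the width scaled by sizeof(T)/2. A minimal sketch of that redirection pattern, with an invented wrapper name and the kernel passed in explicitly (illustrative only, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // Illustrative wrapper: transpose a row-major block of T by treating each
    // element as sizeof(T)/2 consecutive uint16_t values. Assumes sizeof(T)
    // is an even number of bytes, as it is for float, __fp16 and uint16_t.
    template <typename T>
    void transpose_as_u16(uint16_t *out, const T *in, size_t width,
                          size_t stride, size_t height,
                          void (*kernel)(uint16_t *, const uint16_t *,
                                         size_t, size_t, size_t))
    {
        kernel(out,
               reinterpret_cast<const uint16_t *>(in),
               width * (sizeof(T) / 2),  // width in 16-bit units
               stride * sizeof(T),       // row stride in bytes
               height);
    }

This is why Transform<16,...> for float and Transform<32,...> for the 16-bit types can share one kernel: 16 floats occupy exactly 32 uint16_t.
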
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 8x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
-  T* out, const T* const in, const int stride,
-  const int x0, const int xmax, const int k0, const int kmax
-) {
-  // Redirect to a 16 x uint16_t specialisation
-  TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
-    reinterpret_cast<uint16_t *>(out),
-    reinterpret_cast<const uint16_t *>(in),
-    stride*2, x0*2, xmax*2, k0, kmax
-  );
-}
-
-// Generic 16x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
-  T* out, const T* const in, const int stride,
-  const int x0, const int xmax, const int k0, const int kmax
-) {
-  // Redirect to a uint16_t specialisation
-  Transform(
-    reinterpret_cast<uint16_t *>(out),
-    reinterpret_cast<const uint16_t *>(in),
-    stride, x0, xmax, k0, kmax
-  );
-}
-
-// Specialised 16 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *const out) {
-  __asm volatile (
-    "LDR q0, [%[in0]]\n"
-    "STR q0, [%[out]]\n"
-    "LDR q1, [%[in0], #0x10]\n"
-    "STR q1, [%[out], #0x10]\n"
-    "ADD %x[in0], %x[in0], #0x20\n"
-    ASM_PREFETCH("[%[in0], #192]")
-    : [in0] "+r" (in0)
-    : [out] "r" (out)
-    : "v0", "v1", "memory"
-  );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *const out) {
-  __asm volatile (
-    "LDR q0, [%[in0]]\n"
-    "STR q0, [%[out]]\n"
-    "LDR q1, [%[in0], #0x10]\n"
-    "STR q1, [%[out], #0x10]\n"
-    "ADD %x[in0], %x[in0], #0x20\n"
-    ASM_PREFETCH("[%[in0], #192]")
-
-    "LDR q2, [%[in1]]\n"
-    "STR q2, [%[out], #0x20]\n"
-    "LDR q3, [%[in1], #0x10]\n"
-    "STR q3, [%[out], #0x30]\n"
-    "ADD %x[in1], %x[in1], #0x20\n"
-    ASM_PREFETCH("[%[in1], #192]")
-    : [in0] "+r" (in0),
-      [in1] "+r" (in1)
-    : [out] "r" (out)
-    : "v0", "v1", "v2", "v3", "memory"
-  );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *const out) {
-  __asm __volatile (
-    "LDR q0, [%[in0]]\n"
-    "STR q0, [%[out]]\n"
-    "LDR q1, [%[in0], #0x10]\n"
-    "STR q1, [%[out], #0x10]\n"
-    "ADD %x[in0], %x[in0], #0x20\n"
-    ASM_PREFETCH("[%[in0], #192]")
-
-    "LDR q2, [%[in1]]\n"
-    "STR q2, [%[out], #0x20]\n"
-    "LDR q3, [%[in1], #0x10]\n"
-    "STR q3, [%[out], #0x30]\n"
-    "ADD %x[in1], %x[in1], #0x20\n"
-    ASM_PREFETCH("[%[in1], #192]")
-
-    "LDR q0, [%[in2]]\n"
-    "STR q0, [%[out], #0x40]\n"
-    "LDR q1, [%[in2], #0x10]\n"
-    "STR q1, [%[out], #0x50]\n"
-    "ADD %x[in2], %x[in2], #0x20\n"
-    ASM_PREFETCH("[%[in2], #192]")
-
-    "LDR q2, [%[in3]]\n"
-    "STR q2, [%[out], #0x60]\n"
-    "LDR q3, [%[in3], #0x10]\n"
-    "STR q3, [%[out], #0x70]\n"
-    "ADD %x[in3], %x[in3], #0x20\n"
-    ASM_PREFETCH("[%[in3], #192]")
-    : [in0] "+r" (in0),
-      [in1] "+r" (in1),
-      [in2] "+r" (in2),
-      [in3] "+r" (in3)
-    : [out] "r" (out)
-    : "v0", "v1", "v2", "v3", "memory"
-  );
-}
-
-template <>
-template <>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
-  uint16_t* out, const uint16_t* const in, const int stride,
-  const int x0, const int xmax, const int k0, const int kmax
-) {
-  TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp new file mode 100644 index 0000000000..190999ba53 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __aarch64__ + +namespace { + +void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 24 * height * sizeof(uint32_t); + + __asm__ __volatile__( + "cmp %x[height], #0x4\n" + "blt 10f\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add x22, x24, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 3f\n" + "2:" // Main row loop: Column loop + "ldr q7, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "ldr q6, [x22], #0x10\n" + "cmp x19, #0x18\n" + "ldr q5, [x21], #0x10\n" + "ldr q4, [x20], #0x10\n" + "ldr q3, [x24], #0x10\n" + "ldr q2, [x22], #0x10\n" + "ldr q1, [x21], #0x10\n" + "ldr q0, [x20], #0x10\n" + "ldr q31, [x24], #0x10\n" + "ldr q30, [x22], #0x10\n" + "ldr q29, [x21], #0x10\n" + "ldr q28, [x20], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q26, [x22], #0x10\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q7, [x23, #0x0]\n" + "str q3, [x23, #0x10]\n" + "str q31, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q23, [x23, #0x40]\n" + "str q19, [x23, #0x50]\n" + "str q6, [x23, #0x60]\n" + "str q2, [x23, #0x70]\n" + "str q30, [x23, #0x80]\n" + "str q26, [x23, #0x90]\n" + "str q22, [x23, #0xa0]\n" + "str q18, [x23, #0xb0]\n" + "str q5, [x23, #0xc0]\n" + "str q1, [x23, #0xd0]\n" + "str q29, [x23, #0xe0]\n" + "str q25, [x23, #0xf0]\n" + "str q21, [x23, #0x100]\n" + "str q17, [x23, #0x110]\n" + "str q4, [x23, #0x120]\n" + "str q0, [x23, #0x130]\n" + "str q28, [x23, #0x140]\n" + "str q24, [x23, #0x150]\n" + "str q20, [x23, #0x160]\n" + "str q16, [x23, #0x170]\n" + "add x23, x23, %x[out_stride]\n" + 
"bge 2b\n" + "3:" // Main row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 5f\n" + "4:" // Main row loop: width 16 loop: loop + "ldr q31, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "ldr q30, [x22], #0x10\n" + "cmp x19, #0x10\n" + "ldr q29, [x21], #0x10\n" + "ldr q28, [x20], #0x10\n" + "ldr q27, [x24], #0x10\n" + "ldr q26, [x22], #0x10\n" + "ldr q25, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x22], #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q31, [x23, #0x0]\n" + "str q27, [x23, #0x10]\n" + "str q23, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q30, [x23, #0x60]\n" + "str q26, [x23, #0x70]\n" + "str q22, [x23, #0x80]\n" + "str q18, [x23, #0x90]\n" + "str q29, [x23, #0xc0]\n" + "str q25, [x23, #0xd0]\n" + "str q21, [x23, #0xe0]\n" + "str q17, [x23, #0xf0]\n" + "str q28, [x23, #0x120]\n" + "str q24, [x23, #0x130]\n" + "str q20, [x23, #0x140]\n" + "str q16, [x23, #0x150]\n" + "add x23, x23, #0x40\n" + "bge 4b\n" + "5:" // Main row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 7f\n" + "6:" // Main row loop: width 4 loop: loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "ldr q18, [x22], #0x10\n" + "cmp x19, #0x4\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [x23, #0x0]\n" + "str q18, [x23, #0x60]\n" + "str q17, [x23, #0xc0]\n" + "str q16, [x23, #0x120]\n" + "add x23, x23, #0x10\n" + "bge 6b\n" + "7:" // Main row loop: width 4 loop: skip + "cmp x19, #0x1\n" + "blt 9f\n" + "8:" // Main row loop: width 1 loop: loop + "ldr s19, [x24], #0x4\n" + "sub x19, x19, #0x1\n" + "ldr s18, [x22], #0x4\n" + "cmp x19, #0x1\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "str s19, [x23, #0x0]\n" + "str s18, [x23, #0x60]\n" + "str s17, [x23, #0xc0]\n" + "str s16, [x23, #0x120]\n" + "add x23, x23, #0x4\n" + "bge 8b\n" + "9:" // Main row loop: width 1 loop: skip + "add %x[out], %x[out], #0x180\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 20f\n" + "10:" // Main loop skip + + "11:" // Tail row loop: Head + "mov x24, %x[in]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x19, %x[width]\n" + "cmp x19, #0x18\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q21, [x24], #0x10\n" + "sub x19, x19, #0x18\n" + "cmp x19, #0x18\n" + "ldr q20, [x24], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q21, [x23, #0x0]\n" + "str q20, [x23, #0x10]\n" + "str q19, [x23, #0x20]\n" + "str q18, [x23, #0x30]\n" + "str q17, [x23, #0x40]\n" + "str q16, [x23, #0x50]\n" + "add x23, x23, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cmp x19, #0x10\n" + "blt 15f\n" + "14:" // Tail row loop: width 16 loop: loop + "ldr q19, [x24], #0x10\n" + "sub x19, x19, #0x10\n" + "cmp x19, #0x10\n" + "ldr q18, [x24], #0x10\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x24], #0x10\n" + "str q19, [x23, #0x0]\n" + "str q18, [x23, #0x10]\n" + "str q17, [x23, #0x20]\n" + "str q16, [x23, #0x30]\n" + "add x23, x23, #0x40\n" + "bge 14b\n" + "15:" // Tail row loop: width 16 loop: skip + "cmp x19, #0x4\n" + "blt 17f\n" + "16:" // Tail row loop: width 4 loop: loop + "ldr q16, [x24], #0x10\n" + "sub x19, x19, #0x4\n" + "cmp x19, #0x4\n" + "str q16, [x23, #0x0]\n" + "add x23, x23, #0x10\n" + "bge 16b\n" + "17:" // Tail row loop: 
width 4 loop: skip
+      "cmp x19, #0x1\n"
+      "blt 19f\n"
+      "18:" // Tail row loop: width 1 loop: loop
+      "ldr s16, [x24], #0x4\n"
+      "sub x19, x19, #0x1\n"
+      "cmp x19, #0x1\n"
+      "str s16, [x23, #0x0]\n"
+      "add x23, x23, #0x4\n"
+      "bge 18b\n"
+      "19:" // Tail row loop: width 1 loop: skip
+      "add %x[out], %x[out], #0x60\n"
+      "cmp %x[height], #0x1\n"
+      "bge 11b\n"
+      "20:" // Done
+
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<24, 1, true, VLType::None>(
+    float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    a64_transpose_interleave_96(
+        reinterpret_cast<uint32_t *>(out),
+        reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(float) / 4,
+        stride * sizeof(float),
+        (kmax-k0)
+    );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
new file mode 100644
index 0000000000..895177b6cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
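
For readers checking the addressing above: the 96 in the kernel name is the panel size in bytes (24 uint32_t), and out_stride is 24 * height elements. A scalar model of the layout the kernel produces, under the assumption that partial panels simply leave the tail of each block unwritten as the asm does (illustrative sketch, not part of the patch; strides here are in elements, not bytes):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Scalar model: the input (height x width) is cut into column panels of
    // 24 uint32_t; inside a panel each source row is stored contiguously,
    // with rows following each other at 24-element spacing.
    void reference_transpose_interleave_96(uint32_t *out, const uint32_t *in,
                                           size_t width, size_t stride,
                                           size_t height)
    {
        for (size_t x = 0; x < width; x += 24) {
            const size_t n = std::min<size_t>(24, width - x);
            for (size_t y = 0; y < height; y++) {
                for (size_t i = 0; i < n; i++) {
                    out[y * 24 + i] = in[y * stride + x + i];
                }
            }
            out += 24 * height;  // == out_stride in elements
        }
    }
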
+ */ +#include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp" +#include "sve_transpose_interleave_1VL_1x4.hpp" +#include "sve_transpose_interleave_1VL.hpp" +#include "sve_transpose_interleave_3VL_1x4.hpp" +#include "sve_transpose_interleave_3VL_2x2.hpp" +#include "sve_transpose_interleave_3VL.hpp" +#include "sve_transpose_interleave_4VL_1x4.hpp" +#include "sve_transpose_interleave_4VL_2x2.hpp" +#include "sve_transpose_interleave_4VL.hpp" +#include "sve_transpose_interleave_6VL_1x8.hpp" +#include "sve_transpose_interleave_6VL_2x4_fp32bf16.hpp" +#include "sve_transpose_interleave_6VL_2x4.hpp" +#include "sve_transpose_interleave_6VL_4x2.hpp" +#include "sve_transpose_interleave_8VL_1x4.hpp" +#include "sve_transpose_interleave_8VL_1x8.hpp" +#include "sve_transpose_interleave_8VL_2x2.hpp" +#include "sve_transpose_interleave_8VL_2x4.hpp" +#include "sve_transpose_interleave_8VL_2x4_fp32bf16.hpp" +#include "sve_transpose_interleave_8VL.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index e092c729ba..adbaa6cf2f 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -22,7 +22,28 @@ * SOFTWARE. */ #include "a32_transpose_interleave_8way_32bit.hpp" -#include "a64_transpose_interleave_12way_16bit.hpp" -#include "a64_transpose_interleave_12way_half_to_float.hpp" -#include "a64_transpose_interleave_24way_16bit.hpp" -#include "a64_transpose_interleave_8way_32bit.hpp" +#include "a64_transpose_interleave_12_1x4.hpp" +#include "a64_transpose_interleave_12_1x8.hpp" +#include "a64_transpose_interleave_12_2x2.hpp" +#include "a64_transpose_interleave_12_2x4_fp32bf16.hpp" +#include "a64_transpose_interleave_12_2x4.hpp" +#include "a64_transpose_interleave_128.hpp" +#include "a64_transpose_interleave_12_s8s16.hpp" +#include "a64_transpose_interleave_12_u8u16.hpp" +#include "a64_transpose_interleave_16_1x4.hpp" +#include "a64_transpose_interleave_16_1x8.hpp" +#include "a64_transpose_interleave_16_2x2.hpp" +#include "a64_transpose_interleave_16_2x4.hpp" +#include "a64_transpose_interleave_16_2x4_fp32bf16.hpp" +#include "a64_transpose_interleave_16.hpp" +#include "a64_transpose_interleave_24_bf16fp32.hpp" +#include "a64_transpose_interleave_24_fp16fp32.hpp" +#include "a64_transpose_interleave_24_2x4_fp32bf16.hpp" +#include "a64_transpose_interleave_24.hpp" +#include "a64_transpose_interleave_32_1x4.hpp" +#include "a64_transpose_interleave_32_2x2.hpp" +#include "a64_transpose_interleave_4_1x16.hpp" +#include "a64_transpose_interleave_4_1x4.hpp" +#include "a64_transpose_interleave_48.hpp" +#include "a64_transpose_interleave_64.hpp" +#include "a64_transpose_interleave_96.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..ef94cbad39 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_12VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 12 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p6.b\n" + "1:" // Main row loop: Head + "mov x27, %x[in]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "mov x24, %x[width]\n" + "cnth x23, ALL, MUL #6\n" + "add x22, x25, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "add %x[in], x22, %x[in_stride]\n" + "csel x22, x22, %x[pad_row], GT\n" + "csel x25, x25, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x26, x26, %x[pad_row], GT\n" + "cmp x24, x23\n" + "mov x21, %x[out]\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z22.s }, p6/Z, [x27]\n" + "ld1w { z7.s }, p6/Z, [x27, #1, MUL VL]\n" + "mov x20, x21\n" + "add x21, x21, %x[out_stride]\n" + "ld1w { z19.s }, p6/Z, [x27, #2, MUL VL]\n" + "ld1w { z18.s }, p6/Z, [x27, #3, MUL VL]\n" + "mov x19, x21\n" + "sub x24, x24, x23\n" + "ld1w { z5.s }, p6/Z, [x27, #4, MUL VL]\n" + "ld1w { z25.s }, p6/Z, [x27, #5, MUL VL]\n" + "cmp x24, x23\n" + "add x21, x21, %x[out_stride]\n" + "ld1w { z20.s }, p6/Z, [x27, #6, MUL VL]\n" + "ld1w { z23.s }, p6/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #12\n" + "ld1w { z4.s }, p6/Z, [x25]\n" + "ld1w { z10.s }, p6/Z, [x25, #1, MUL VL]\n" + "zip1 z14.s, z22.s, z4.s\n" + "zip2 z22.s, z22.s, z4.s\n" + "ld1w { z28.s }, p6/Z, [x25, #2, MUL VL]\n" + "ld1w { z27.s }, p6/Z, [x25, #3, MUL VL]\n" + "zip1 z24.s, z7.s, z10.s\n" + "zip2 z15.s, z7.s, z10.s\n" + "ld1w { z7.s }, p6/Z, [x25, #4, MUL VL]\n" + "ld1w { z2.s }, p6/Z, [x25, #5, MUL VL]\n" + "zip1 z9.s, z19.s, z28.s\n" + "zip2 z0.s, z19.s, z28.s\n" + "ld1w { z19.s }, p6/Z, [x25, #6, MUL VL]\n" + "ld1w { z16.s }, p6/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #12\n" + "zip1 z1.s, z18.s, z27.s\n" + "ld1w { z30.s }, p6/Z, [x27, #-4, MUL VL]\n" + "ld1w { z29.s }, p6/Z, [x27, #-3, MUL VL]\n" + "zip2 z17.s, z18.s, z27.s\n" + ".inst 0x658ab9d5 // bfcvt z21.h, p6/M, z14.s\n" + "ld1w { z31.s }, p6/Z, [x26]\n" + "ld1w { z8.s }, p6/Z, [x26, #1, MUL VL]\n" + ".inst 0x658abacc // bfcvt z12.h, p6/M, z22.s\n" 
+ ".inst 0x658abb0e // bfcvt z14.h, p6/M, z24.s\n" + "ld1w { z22.s }, p6/Z, [x26, #2, MUL VL]\n" + "ld1w { z28.s }, p6/Z, [x26, #3, MUL VL]\n" + ".inst 0x658ab9ea // bfcvt z10.h, p6/M, z15.s\n" + ".inst 0x658ab92f // bfcvt z15.h, p6/M, z9.s\n" + "ld1w { z27.s }, p6/Z, [x26, #4, MUL VL]\n" + "ld1w { z13.s }, p6/Z, [x26, #5, MUL VL]\n" + ".inst 0x658ab803 // bfcvt z3.h, p6/M, z0.s\n" + ".inst 0x658ab832 // bfcvt z18.h, p6/M, z1.s\n" + "ld1w { z26.s }, p6/Z, [x26, #6, MUL VL]\n" + "ld1w { z9.s }, p6/Z, [x26, #7, MUL VL]\n" + "addvl x26, x26, #12\n" + ".inst 0x658aba26 // bfcvt z6.h, p6/M, z17.s\n" + "ld1w { z1.s }, p6/Z, [x25, #-4, MUL VL]\n" + "ld1w { z0.s }, p6/Z, [x25, #-3, MUL VL]\n" + "zip1 z17.s, z5.s, z7.s\n" + "zip2 z5.s, z5.s, z7.s\n" + "ld1w { z24.s }, p6/Z, [x22]\n" + "ld1w { z11.s }, p6/Z, [x22, #1, MUL VL]\n" + "zip1 z7.s, z31.s, z24.s\n" + "zip2 z31.s, z31.s, z24.s\n" + "ld1w { z4.s }, p6/Z, [x22, #2, MUL VL]\n" + "ld1w { z24.s }, p6/Z, [x22, #3, MUL VL]\n" + ".inst 0x648ab8f5 // bfcvtnt z21.h, p6/M, z7.s\n" + "zip1 z7.s, z8.s, z11.s\n" + "zip2 z11.s, z8.s, z11.s\n" + "ld1w { z8.s }, p6/Z, [x22, #4, MUL VL]\n" + ".inst 0x648abbec // bfcvtnt z12.h, p6/M, z31.s\n" + "ld1w { z31.s }, p6/Z, [x22, #5, MUL VL]\n" + ".inst 0x648ab8ee // bfcvtnt z14.h, p6/M, z7.s\n" + "ld1w { z7.s }, p6/Z, [x22, #6, MUL VL]\n" + ".inst 0x648ab96a // bfcvtnt z10.h, p6/M, z11.s\n" + "zip1 z11.s, z22.s, z4.s\n" + "zip2 z4.s, z22.s, z4.s\n" + "ld1w { z22.s }, p6/Z, [x22, #7, MUL VL]\n" + "addvl x22, x22, #12\n" + ".inst 0x648ab96f // bfcvtnt z15.h, p6/M, z11.s\n" + "ld1w { z11.s }, p6/Z, [x27, #-2, MUL VL]\n" + ".inst 0x648ab883 // bfcvtnt z3.h, p6/M, z4.s\n" + "zip1 z4.s, z28.s, z24.s\n" + "zip2 z24.s, z28.s, z24.s\n" + "ld1w { z28.s }, p6/Z, [x27, #-1, MUL VL]\n" + ".inst 0x648ab892 // bfcvtnt z18.h, p6/M, z4.s\n" + "ld1w { z4.s }, p6/Z, [x26, #-4, MUL VL]\n" + ".inst 0x648abb06 // bfcvtnt z6.h, p6/M, z24.s\n" + "zip1 z24.s, z25.s, z2.s\n" + "zip2 z25.s, z25.s, z2.s\n" + "zip1 z2.s, z20.s, z19.s\n" + "zip2 z20.s, z20.s, z19.s\n" + "zip1 z19.s, z23.s, z16.s\n" + "zip2 z16.s, z23.s, z16.s\n" + "zip1 z23.s, z30.s, z1.s\n" + "zip2 z30.s, z30.s, z1.s\n" + "zip1 z1.s, z29.s, z0.s\n" + "zip2 z0.s, z29.s, z0.s\n" + ".inst 0x658aba31 // bfcvt z17.h, p6/M, z17.s\n" + "zip1 z29.s, z27.s, z8.s\n" + ".inst 0x658ab8a5 // bfcvt z5.h, p6/M, z5.s\n" + "zip2 z27.s, z27.s, z8.s\n" + "ld1w { z8.s }, p6/Z, [x26, #-3, MUL VL]\n" + ".inst 0x658abb18 // bfcvt z24.h, p6/M, z24.s\n" + ".inst 0x658abb39 // bfcvt z25.h, p6/M, z25.s\n" + ".inst 0x658ab842 // bfcvt z2.h, p6/M, z2.s\n" + ".inst 0x658aba94 // bfcvt z20.h, p6/M, z20.s\n" + ".inst 0x658aba73 // bfcvt z19.h, p6/M, z19.s\n" + ".inst 0x658aba10 // bfcvt z16.h, p6/M, z16.s\n" + ".inst 0x658abaf7 // bfcvt z23.h, p6/M, z23.s\n" + ".inst 0x658abbde // bfcvt z30.h, p6/M, z30.s\n" + ".inst 0x658ab821 // bfcvt z1.h, p6/M, z1.s\n" + ".inst 0x658ab800 // bfcvt z0.h, p6/M, z0.s\n" + ".inst 0x648abbb1 // bfcvtnt z17.h, p6/M, z29.s\n" + "ld1w { z29.s }, p6/Z, [x25, #-2, MUL VL]\n" + ".inst 0x648abb65 // bfcvtnt z5.h, p6/M, z27.s\n" + "zip1 z27.s, z13.s, z31.s\n" + "zip2 z31.s, z13.s, z31.s\n" + "ld1w { z13.s }, p6/Z, [x25, #-1, MUL VL]\n" + ".inst 0x648abb78 // bfcvtnt z24.h, p6/M, z27.s\n" + "ld1w { z27.s }, p6/Z, [x22, #-4, MUL VL]\n" + ".inst 0x648abbf9 // bfcvtnt z25.h, p6/M, z31.s\n" + "zip1 z31.s, z26.s, z7.s\n" + "zip2 z26.s, z26.s, z7.s\n" + "ld1w { z7.s }, p6/Z, [x22, #-3, MUL VL]\n" + ".inst 0x648abbe2 // bfcvtnt z2.h, p6/M, z31.s\n" + "ld1w { z31.s }, p6/Z, [x26, #-2, MUL 
VL]\n" + ".inst 0x648abb54 // bfcvtnt z20.h, p6/M, z26.s\n" + "zip1 z26.s, z9.s, z22.s\n" + "zip2 z9.s, z9.s, z22.s\n" + "ld1w { z22.s }, p6/Z, [x26, #-1, MUL VL]\n" + ".inst 0x648abb53 // bfcvtnt z19.h, p6/M, z26.s\n" + "ld1w { z26.s }, p6/Z, [x22, #-2, MUL VL]\n" + ".inst 0x648ab930 // bfcvtnt z16.h, p6/M, z9.s\n" + "ld1w { z9.s }, p6/Z, [x22, #-1, MUL VL]\n" + "st1h { z21.h }, p6, [x20]\n" + "zip1 z21.s, z4.s, z27.s\n" + "zip2 z27.s, z4.s, z27.s\n" + "zip1 z4.s, z8.s, z7.s\n" + "zip2 z8.s, z8.s, z7.s\n" + "st1h { z12.h }, p6, [x20, #1, MUL VL]\n" + "zip1 z7.s, z11.s, z29.s\n" + "zip2 z11.s, z11.s, z29.s\n" + "st1h { z14.h }, p6, [x20, #2, MUL VL]\n" + "zip1 z29.s, z28.s, z13.s\n" + "zip2 z12.s, z28.s, z13.s\n" + "st1h { z10.h }, p6, [x20, #3, MUL VL]\n" + "st1h { z15.h }, p6, [x20, #4, MUL VL]\n" + ".inst 0x648abab7 // bfcvtnt z23.h, p6/M, z21.s\n" + ".inst 0x648abb7e // bfcvtnt z30.h, p6/M, z27.s\n" + "st1h { z3.h }, p6, [x20, #5, MUL VL]\n" + ".inst 0x648ab881 // bfcvtnt z1.h, p6/M, z4.s\n" + ".inst 0x648ab900 // bfcvtnt z0.h, p6/M, z8.s\n" + "st1h { z18.h }, p6, [x20, #6, MUL VL]\n" + ".inst 0x658ab8e8 // bfcvt z8.h, p6/M, z7.s\n" + "zip1 z27.s, z31.s, z26.s\n" + "st1h { z6.h }, p6, [x20, #7, MUL VL]\n" + "addvl x20, x20, #12\n" + ".inst 0x658ab96e // bfcvt z14.h, p6/M, z11.s\n" + "zip2 z28.s, z31.s, z26.s\n" + ".inst 0x658abbbd // bfcvt z29.h, p6/M, z29.s\n" + "zip1 z21.s, z22.s, z9.s\n" + "st1h { z17.h }, p6, [x20, #-4, MUL VL]\n" + ".inst 0x658ab992 // bfcvt z18.h, p6/M, z12.s\n" + "zip2 z17.s, z22.s, z9.s\n" + "st1h { z5.h }, p6, [x20, #-3, MUL VL]\n" + "st1h { z24.h }, p6, [x20, #-2, MUL VL]\n" + ".inst 0x648abb68 // bfcvtnt z8.h, p6/M, z27.s\n" + ".inst 0x648abb8e // bfcvtnt z14.h, p6/M, z28.s\n" + "st1h { z25.h }, p6, [x20, #-1, MUL VL]\n" + ".inst 0x648ababd // bfcvtnt z29.h, p6/M, z21.s\n" + ".inst 0x648aba32 // bfcvtnt z18.h, p6/M, z17.s\n" + "st1h { z2.h }, p6, [x19]\n" + "st1h { z20.h }, p6, [x19, #1, MUL VL]\n" + "st1h { z19.h }, p6, [x19, #2, MUL VL]\n" + "st1h { z16.h }, p6, [x19, #3, MUL VL]\n" + "st1h { z23.h }, p6, [x19, #4, MUL VL]\n" + "st1h { z30.h }, p6, [x19, #5, MUL VL]\n" + "st1h { z1.h }, p6, [x19, #6, MUL VL]\n" + "st1h { z0.h }, p6, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "st1h { z8.h }, p6, [x19, #-4, MUL VL]\n" + "st1h { z14.h }, p6, [x19, #-3, MUL VL]\n" + "st1h { z29.h }, p6, [x19, #-2, MUL VL]\n" + "st1h { z18.h }, p6, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x24, 5f\n" + "4:" // Main row loop: Column loop + "mov x19, x24\n" + "whilelt p5.s, XZR, x19\n" + "ld1w { z22.s }, p5/Z, [x27]\n" + "ld1w { z21.s }, p5/Z, [x25]\n" + "decw x19\n" + "whilelt p4.s, XZR, x19\n" + "ld1w { z20.s }, p4/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x25, #1, MUL VL]\n" + "decw x19\n" + "whilelt p3.s, XZR, x19\n" + "ld1w { z18.s }, p3/Z, [x27, #2, MUL VL]\n" + "ld1w { z17.s }, p3/Z, [x25, #2, MUL VL]\n" + "decw x19\n" + "whilelt p2.s, XZR, x19\n" + "ld1w { z30.s }, p2/Z, [x27, #3, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n" + "decw x19\n" + "whilelt p1.s, XZR, x19\n" + "ld1w { z13.s }, p1/Z, [x27, #4, MUL VL]\n" + "ld1w { z29.s }, p5/Z, [x26]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z12.s }, p0/Z, [x27, #5, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x26, #1, MUL VL]\n" + "ld1w { z11.s }, p3/Z, [x26, #2, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x26, #3, MUL VL]\n" + "zip1 z27.s, z22.s, z21.s\n" + "zip2 z26.s, z22.s, z21.s\n" + "ld1w { z9.s }, p1/Z, [x25, #4, MUL VL]\n" + "ld1w { z8.s }, p0/Z, 
[x25, #5, MUL VL]\n" + "zip1 z25.s, z20.s, z19.s\n" + "zip2 z24.s, z20.s, z19.s\n" + "ld1w { z23.s }, p5/Z, [x22]\n" + "ld1w { z22.s }, p4/Z, [x22, #1, MUL VL]\n" + "zip1 z21.s, z18.s, z17.s\n" + "zip2 z20.s, z18.s, z17.s\n" + "ld1w { z19.s }, p3/Z, [x22, #2, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #3, MUL VL]\n" + "zip1 z17.s, z30.s, z16.s\n" + "zip2 z16.s, z30.s, z16.s\n" + "ld1w { z7.s }, p1/Z, [x26, #4, MUL VL]\n" + "ld1w { z6.s }, p0/Z, [x26, #5, MUL VL]\n" + ".inst 0x658abb65 // bfcvt z5.h, p6/M, z27.s\n" + "zip1 z4.s, z29.s, z23.s\n" + "ld1w { z3.s }, p1/Z, [x22, #4, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n" + ".inst 0x658abb41 // bfcvt z1.h, p6/M, z26.s\n" + "zip2 z0.s, z29.s, z23.s\n" + ".inst 0x658abb3f // bfcvt z31.h, p6/M, z25.s\n" + "zip1 z30.s, z28.s, z22.s\n" + "mov x19, x21\n" + "decd x24, ALL, MUL #12\n" + ".inst 0x658abb1d // bfcvt z29.h, p6/M, z24.s\n" + "zip2 z28.s, z28.s, z22.s\n" + "cmp x24, #0x0\n" + "addvl x27, x27, #6\n" + ".inst 0x658ababb // bfcvt z27.h, p6/M, z21.s\n" + "zip1 z23.s, z11.s, z19.s\n" + "addvl x26, x26, #6\n" + "addvl x25, x25, #6\n" + ".inst 0x658aba9a // bfcvt z26.h, p6/M, z20.s\n" + "zip2 z22.s, z11.s, z19.s\n" + "addvl x22, x22, #6\n" + "add x21, x21, %x[out_stride]\n" + ".inst 0x658aba39 // bfcvt z25.h, p6/M, z17.s\n" + "zip1 z21.s, z10.s, z18.s\n" + ".inst 0x658aba18 // bfcvt z24.h, p6/M, z16.s\n" + "zip2 z20.s, z10.s, z18.s\n" + "zip1 z19.s, z13.s, z9.s\n" + "zip2 z18.s, z13.s, z9.s\n" + "zip1 z17.s, z12.s, z8.s\n" + "zip2 z16.s, z12.s, z8.s\n" + ".inst 0x648ab885 // bfcvtnt z5.h, p6/M, z4.s\n" + ".inst 0x648ab801 // bfcvtnt z1.h, p6/M, z0.s\n" + "st1h { z5.h }, p6, [x19]\n" + ".inst 0x648abbdf // bfcvtnt z31.h, p6/M, z30.s\n" + ".inst 0x648abb9d // bfcvtnt z29.h, p6/M, z28.s\n" + "st1h { z1.h }, p6, [x19, #1, MUL VL]\n" + ".inst 0x648abafb // bfcvtnt z27.h, p6/M, z23.s\n" + ".inst 0x648abada // bfcvtnt z26.h, p6/M, z22.s\n" + "st1h { z31.h }, p6, [x19, #2, MUL VL]\n" + ".inst 0x648abab9 // bfcvtnt z25.h, p6/M, z21.s\n" + ".inst 0x648aba98 // bfcvtnt z24.h, p6/M, z20.s\n" + "st1h { z29.h }, p6, [x19, #3, MUL VL]\n" + ".inst 0x658aba77 // bfcvt z23.h, p6/M, z19.s\n" + "zip1 z22.s, z7.s, z3.s\n" + "st1h { z27.h }, p6, [x19, #4, MUL VL]\n" + ".inst 0x658aba55 // bfcvt z21.h, p6/M, z18.s\n" + "zip2 z20.s, z7.s, z3.s\n" + "st1h { z26.h }, p6, [x19, #5, MUL VL]\n" + ".inst 0x658aba33 // bfcvt z19.h, p6/M, z17.s\n" + "zip1 z18.s, z6.s, z2.s\n" + "st1h { z25.h }, p6, [x19, #6, MUL VL]\n" + ".inst 0x658aba11 // bfcvt z17.h, p6/M, z16.s\n" + "zip2 z16.s, z6.s, z2.s\n" + "st1h { z24.h }, p6, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + ".inst 0x648abad7 // bfcvtnt z23.h, p6/M, z22.s\n" + ".inst 0x648aba95 // bfcvtnt z21.h, p6/M, z20.s\n" + "st1h { z23.h }, p6, [x19, #-4, MUL VL]\n" + ".inst 0x648aba53 // bfcvtnt z19.h, p6/M, z18.s\n" + ".inst 0x648aba11 // bfcvtnt z17.h, p6/M, z16.s\n" + "st1h { z21.h }, p6, [x19, #-3, MUL VL]\n" + "st1h { z19.h }, p6, [x19, #-2, MUL VL]\n" + "st1h { z17.h }, p6, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #12\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", 
"z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +template<> +void Transform<12, 4, true, VLType::SVE>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_12VL_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp new file mode 100644 index 0000000000..33694dfb0c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 1 * height * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "cmp %x[height], #0x4\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[out]\n" + "add x23, x25, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add %x[in], x21, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cntw x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z23.s }, p1/Z, [x25]\n" + "sub x20, x20, x19\n" + "ld1w { z22.s }, p1/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "ld1w { z21.s }, p1/Z, [x23]\n" + "cmp x20, x19\n" + "ld1w { z20.s }, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "ld1w { z19.s }, p1/Z, [x22]\n" + "ld1w { z18.s }, p1/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "ld1w { z17.s }, p1/Z, [x21]\n" + "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n" + "addvl x21, x21, #2\n" + "st1w { z23.s }, p1, [x24]\n" + "st1w { z21.s }, p1, [x24, #1, MUL VL]\n" + "st1w { z19.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z17.s }, p1, [x24, #3, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "st1w { z22.s }, p1, [x24]\n" + "st1w { z20.s }, p1, [x24, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z16.s }, p1, [x24, #3, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.s, XZR, x20\n" + "ld1w { z19.s }, p0/Z, [x25]\n" + "addvl x25, x25, #1\n" + "ld1w { z18.s }, p0/Z, [x23]\n" + "addvl x23, x23, #1\n" + "ld1w { z17.s }, p0/Z, [x22]\n" + "addvl x22, x22, #1\n" + "ld1w { z16.s }, p0/Z, [x21]\n" + "addvl x21, x21, #1\n" + "st1w { z19.s }, p1, [x24]\n" + "decw x20\n" + "st1w { z18.s }, p1, [x24, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1w { z17.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z16.s }, p1, [x24, #3, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #4\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[out]\n" + "add %x[in], x25, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x20, %x[width]\n" + "cntw x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1w { z17.s }, p1/Z, [x25]\n" + "sub x20, x20, x19\n" + "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "cmp x20, x19\n" + "st1w { z17.s }, p1, [x24]\n" + "add x24, x24, %x[out_stride]\n" + "st1w { z16.s }, p1, [x24]\n" + "add x24, x24, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "whilelt p0.s, XZR, x20\n" + "ld1w { z16.s }, p0/Z, [x25]\n" + "addvl x25, x25, #1\n" + "decw x20\n" + "st1w { z16.s }, p1, [x24]\n" + "add x24, x24, %x[out_stride]\n" + "cmp x20, #0x0\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #1\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", 
"p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23" + ); +} + +} // anonymous namespace + +template<> +void Transform<1, 1, true, VLType::SVE>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_1VL( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 4, + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp new file mode 100644 index 0000000000..e4fb7ea4c1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 1 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "cmp %x[height], #0x8\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[out]\n" + "add x27, x9, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add %x[in], x21, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1b { z17.b }, p1/Z, [x9]\n" + "sub x20, x20, x19\n" + "ld1b { z3.b }, p1/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "ld1b { z21.b }, p1/Z, [x27]\n" + "cmp x20, x19\n" + "ld1b { z2.b }, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "ld1b { z16.b }, p1/Z, [x26]\n" + "zip1 z20.b, z17.b, z16.b\n" + "ld1b { z1.b }, p1/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "zip2 z19.b, z17.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x25]\n" + "ld1b { z0.b }, p1/Z, [x25, #1, MUL VL]\n" + "zip1 z31.b, z3.b, z1.b\n" + "ld1b { z30.b }, p1/Z, [x24]\n" + "addvl x25, x25, #2\n" + "zip1 z16.b, z21.b, z17.b\n" + "ld1b { z29.b }, p1/Z, [x24, #1, MUL VL]\n" + "addvl x24, x24, #2\n" + "zip1 z18.b, z20.b, z16.b\n" + "ld1b { z28.b }, p1/Z, [x23]\n" + "zip2 z27.b, z20.b, z16.b\n" + "ld1b { z26.b }, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "zip2 z17.b, z21.b, z17.b\n" + "ld1b { z16.b }, p1/Z, [x22]\n" + "zip1 z25.b, z2.b, z0.b\n" + "ld1b { z24.b }, p1/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "zip1 z23.b, z19.b, z17.b\n" + "ld1b { z22.b }, p1/Z, [x21]\n" + "zip2 z20.b, z19.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x21, #1, MUL VL]\n" + "addvl x21, x21, #2\n" + "zip1 z19.b, z30.b, z16.b\n" + "st1b { z18.b }, p1, [x28]\n" + "zip2 z18.b, z30.b, z16.b\n" + "zip1 z17.b, z28.b, z22.b\n" + "zip1 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z19.b, z17.b\n" + "st1b { z27.b }, p1, [x28]\n" + "zip2 z17.b, z28.b, z22.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z23.b }, p1, [x28]\n" + "zip2 z17.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z31.b, z25.b\n" + "st1b { z20.b }, p1, [x28]\n" + "zip1 z19.b, z29.b, z24.b\n" + "st1b { z17.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z18.b, z26.b, z21.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z17.b, z31.b, z25.b\n" + "zip1 z16.b, z19.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z19.b, z18.b\n" + "st1b { z17.b }, p1, [x28]\n" + "zip2 z20.b, z3.b, z1.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z19.b, z2.b, z0.b\n" + "zip2 z18.b, z29.b, z24.b\n" + "zip1 z16.b, z20.b, z19.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z17.b, z26.b, z21.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, 
x28, %x[out_stride]\n" + "zip2 z16.b, z20.b, z19.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z18.b }, p0/Z, [x9]\n" + "incd x9, ALL, MUL #2\n" + "ld1b { z17.b }, p0/Z, [x27]\n" + "incd x27, ALL, MUL #2\n" + "ld1b { z16.b }, p0/Z, [x26]\n" + "zip1 z18.b, z18.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x25]\n" + "incd x26, ALL, MUL #2\n" + "zip1 z16.b, z17.b, z16.b\n" + "ld1b { z17.b }, p0/Z, [x24]\n" + "incd x25, ALL, MUL #2\n" + "zip1 z19.b, z18.b, z16.b\n" + "ld1b { z18.b }, p0/Z, [x23]\n" + "incd x24, ALL, MUL #2\n" + "ld1b { z16.b }, p0/Z, [x22]\n" + "zip1 z17.b, z17.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x21]\n" + "incd x23, ALL, MUL #2\n" + "zip1 z16.b, z18.b, z16.b\n" + "st1b { z19.b }, p1, [x28]\n" + "incd x22, ALL, MUL #2\n" + "zip1 z16.b, z17.b, z16.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "incd x21, ALL, MUL #2\n" + "add x28, x28, %x[out_stride]\n" + "decw x20\n" + "cmp x20, #0x0\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #2\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[out]\n" + "add x27, x9, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add %x[in], x25, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x25, x25, %x[pad_row], GT\n" + "csel x26, x26, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x27, x27, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1b { z19.b }, p1/Z, [x9]\n" + "sub x20, x20, x19\n" + "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "ld1b { z25.b }, p1/Z, [x27]\n" + "cmp x20, x19\n" + "ld1b { z24.b }, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "ld1b { z17.b }, p1/Z, [x26]\n" + "zip1 z23.b, z19.b, z17.b\n" + "ld1b { z16.b }, p1/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "zip2 z22.b, z19.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x25]\n" + "ld1b { z20.b }, p1/Z, [x25, #1, MUL VL]\n" + "zip1 z19.b, z18.b, z16.b\n" + "addvl x25, x25, #2\n" + "zip2 z18.b, z18.b, z16.b\n" + "zip1 z17.b, z25.b, z21.b\n" + "zip1 z16.b, z23.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z23.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z17.b, z25.b, z21.b\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z22.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z22.b, z17.b\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z17.b, z24.b, z20.b\n" + "st1b { z16.b }, p1, [x28]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z19.b, z17.b\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z17.b, z24.b, z20.b\n" + "st1b { z16.b }, p1, [x28]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "add x28, x28, %x[out_stride]\n" + "st1b { z16.b }, p1, [x28]\n" + "add x28, x28, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z17.b }, 
p0/Z, [x9]\n"
+      "incd x9, ALL, MUL #2\n"
+      "ld1b { z18.b }, p0/Z, [x27]\n"
+      "incd x27, ALL, MUL #2\n"
+      "ld1b { z16.b }, p0/Z, [x26]\n"
+      "zip1 z17.b, z17.b, z16.b\n"
+      "ld1b { z16.b }, p0/Z, [x25]\n"
+      "incd x26, ALL, MUL #2\n"
+      "zip1 z16.b, z18.b, z16.b\n"
+      "incd x25, ALL, MUL #2\n"
+      "decw x20\n"
+      "zip1 z16.b, z17.b, z16.b\n"
+      "st1b { z16.b }, p1, [x28]\n"
+      "add x28, x28, %x[out_stride]\n"
+      "cmp x20, #0x0\n"
+      "bgt 10b\n"
+      "11:" // Tail row loop: Column loop skip
+      "addvl %x[out], %x[out], #1\n"
+      "cmp %x[height], #0x1\n"
+      "bge 7b\n"
+      "12:" // Done
+
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+    uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sve_transpose_interleave_1VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(uint8_t) / 1,
+        stride * sizeof(uint8_t),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+    int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sve_transpose_interleave_1VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(int8_t) / 1,
+        stride * sizeof(int8_t),
+        (kmax-k0)
+    );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
new file mode 100644
index 0000000000..0d694f3ec0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
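
The zip1/zip2 ladder in the 1VL_1x4 kernel above implements a byte-wise four-row interleave: each group of four consecutive output bytes takes one byte from each of four source rows, which is the operand order the sdot/udot hybrid kernels consume. In scalar terms (an illustrative model for one group of four rows, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // Scalar model of the 1x4 block interleave: output byte order per
    // column x is row0[x], row1[x], row2[x], row3[x].
    void reference_interleave_1x4(uint8_t *out, const uint8_t *const row[4],
                                  size_t width)
    {
        for (size_t x = 0; x < width; x++) {
            for (int r = 0; r < 4; r++) {
                *out++ = row[r][x];
            }
        }
    }
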
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 3 * height * get_vector_length(); + + __asm__ __volatile__( + "ptrue p2.b\n" + "cmp %x[height], #0x4\n" + "blt 4f\n" + "1:" // Main row loop: Head + "mov x26, %x[in]\n" + "mov x25, %x[out]\n" + "add x24, x26, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add %x[in], x22, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x21, %x[width]\n" + "2:" // Main row loop: Column loop + "mov x20, x21\n" + "mov x19, x25\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z27.h }, p0/Z, [x26]\n" + "ld1h { z26.h }, p0/Z, [x24]\n" + "dech x20\n" + "ld1h { z25.h }, p0/Z, [x23]\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z24.h }, p0/Z, [x22]\n" + "dech x20\n" + "ld1h { z23.h }, p1/Z, [x26, #1, MUL VL]\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z22.h }, p1/Z, [x24, #1, MUL VL]\n" + "add x25, x25, %x[out_stride]\n" + "ld1h { z21.h }, p0/Z, [x26, #2, MUL VL]\n" + "addvl x26, x26, #3\n" + "ld1h { z20.h }, p0/Z, [x24, #2, MUL VL]\n" + "addvl x24, x24, #3\n" + "ld1h { z19.h }, p1/Z, [x23, #1, MUL VL]\n" + "dech x21, ALL, MUL #3\n" + "ld1h { z18.h }, p0/Z, [x23, #2, MUL VL]\n" + "addvl x23, x23, #3\n" + "ld1h { z17.h }, p1/Z, [x22, #1, MUL VL]\n" + "cmp x21, #0x0\n" + "ld1h { z16.h }, p0/Z, [x22, #2, MUL VL]\n" + "addvl x22, x22, #3\n" + "st1h { z27.h }, p2, [x19]\n" + "st1h { z23.h }, p2, [x19, #1, MUL VL]\n" + "st1h { z21.h }, p2, [x19, #2, MUL VL]\n" + "st1h { z26.h }, p2, [x19, #3, MUL VL]\n" + "st1h { z22.h }, p2, [x19, #4, MUL VL]\n" + "st1h { z20.h }, p2, [x19, #5, MUL VL]\n" + "st1h { z25.h }, p2, [x19, #6, MUL VL]\n" + "st1h { z19.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "st1h { z18.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z24.h }, p2, [x19, #-3, MUL VL]\n" + "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bgt 2b\n" + "3:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #12\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 8f\n" + "4:" // Main loop skip + + "5:" // Tail row loop: Head + "mov x26, %x[in]\n" + "mov x25, %x[out]\n" + "add %x[in], x26, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x20, %x[width]\n" + "6:" // Tail row loop: Column loop + "mov x19, x20\n" + "dech x20, ALL, MUL #3\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z18.h }, p0/Z, [x26]\n" + "dech x19\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x26, #1, MUL VL]\n" + "dech x19\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z16.h }, p0/Z, [x26, #2, MUL VL]\n" + "st1h { z18.h }, p2, [x25]\n" + "addvl x26, x26, #3\n" + "st1h { z17.h }, p2, [x25, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1h { z16.h }, p2, [x25, #2, MUL VL]\n" + "add x25, x25, %x[out_stride]\n" + "bgt 6b\n" + "7:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #3\n" + "cmp %x[height], #0x1\n" + "bge 5b\n" + "8:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27" + ); +} + +} // anonymous namespace + +template<> +void Transform<3, 1, true, VLType::SVE>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) 
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
new file mode 100644
index 0000000000..15b32c804f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 4) * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z19.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z10.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1b { z24.b }, p1/Z, [x27]\n"
+ "ld1b { z23.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1b { z9.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "ld1b { z8.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip1 z22.b, z19.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "zip2 z7.b, z19.b, z17.b\n"
+ "ld1b { z20.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z6.b, z10.b, z8.b\n"
+ "ld1b { z5.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z17.b, z24.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x24]\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "ld1b { z4.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "zip1 z3.b, z21.b, z17.b\n"
+ "ld1b { z2.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip2 z1.b, z21.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x23]\n"
+ "zip1 z31.b, z18.b, z16.b\n"
+ "ld1b { z30.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z29.b, z18.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip1 z18.b, z23.b, z20.b\n"
+ "ld1b { z17.b }, p1/Z, [x22]\n"
+ "zip2 z27.b, z23.b, z20.b\n"
+ "ld1b { z26.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "zip1 z25.b, z22.b, z18.b\n"
+ "ld1b { z24.b }, p1/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "zip1 z21.b, z19.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x21]\n"
+ "zip2 z19.b, z19.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "zip2 z20.b, z22.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #3\n"
+ "zip1 z17.b, z0.b, z16.b\n"
+ "st1b { z3.b }, p1, [x28]\n"
+ "zip2 z18.b, z0.b, z16.b\n"
+ "st1b { z1.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z21.b, z17.b\n"
+ "st1b { z31.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z16.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z18.b, z4.b, z26.b\n"
+ "st1b { z29.b }, p1, [x28]\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "st1b { z25.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z20.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "st1b { z19.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z17.b, z7.b, z27.b\n"
+ 
"st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z7.b, z27.b\n" + "st1b { z18.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z21.b, z9.b, z5.b\n" + "st1b { z17.b }, p1, [x28]\n" + "zip2 z18.b, z4.b, z26.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z16.b, z6.b, z21.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z17.b, z30.b, z23.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip1 z20.b, z2.b, z24.b\n" + "zip1 z19.b, z28.b, z22.b\n" + "zip1 z16.b, z20.b, z19.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z6.b, z21.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z18.b, z10.b, z8.b\n" + "zip2 z17.b, z9.b, z5.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z20.b, z19.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z18.b, z2.b, z24.b\n" + "zip2 z17.b, z28.b, z22.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z18.b }, p0/Z, [x9]\n" + "incd x9, ALL, MUL #6\n" + "ld1b { z25.b }, p0/Z, [x27]\n" + "incd x27, ALL, MUL #6\n" + "ld1b { z16.b }, p0/Z, [x26]\n" + "zip1 z17.b, z18.b, z16.b\n" + "ld1b { z24.b }, p0/Z, [x25]\n" + "incd x26, ALL, MUL #6\n" + "zip2 z19.b, z18.b, z16.b\n" + "ld1b { z23.b }, p0/Z, [x24]\n" + "incd x25, ALL, MUL #6\n" + "zip1 z16.b, z25.b, z24.b\n" + "ld1b { z22.b }, p0/Z, [x23]\n" + "incd x24, ALL, MUL #6\n" + "zip1 z18.b, z17.b, z16.b\n" + "ld1b { z21.b }, p0/Z, [x22]\n" + "incd x23, ALL, MUL #6\n" + "zip2 z17.b, z17.b, z16.b\n" + "ld1b { z20.b }, p0/Z, [x21]\n" + "incd x22, ALL, MUL #6\n" + "zip2 z16.b, z25.b, z24.b\n" + "st1b { z18.b }, p1, [x28]\n" + "incd x21, ALL, MUL #6\n" + "zip1 z16.b, z19.b, z16.b\n" + "st1b { z17.b }, p1, [x28, #1, MUL VL]\n" + "decw x20, ALL, MUL #3\n" + "zip1 z19.b, z23.b, z21.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "cmp x20, #0x0\n" + "zip1 z18.b, z22.b, z20.b\n" + "zip2 z17.b, z23.b, z21.b\n" + "zip1 z16.b, z19.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z16.b, z19.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z22.b, z20.b\n" + "zip1 z16.b, z17.b, z16.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #6\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[out]\n" + "add x27, x9, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add %x[in], x25, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x25, x25, %x[pad_row], GT\n" + "csel x26, x26, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x27, x27, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #3\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1b { z19.b }, p1/Z, [x9]\n" + "sub x20, x20, x19\n" + "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n" + "cmp x20, 
x19\n" + "ld1b { z30.b }, p1/Z, [x9, #2, MUL VL]\n" + "addvl x9, x9, #3\n" + "ld1b { z29.b }, p1/Z, [x27]\n" + "ld1b { z28.b }, p1/Z, [x27, #1, MUL VL]\n" + "ld1b { z27.b }, p1/Z, [x27, #2, MUL VL]\n" + "addvl x27, x27, #3\n" + "ld1b { z16.b }, p1/Z, [x26]\n" + "zip1 z26.b, z19.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x26, #1, MUL VL]\n" + "zip2 z25.b, z19.b, z16.b\n" + "ld1b { z24.b }, p1/Z, [x26, #2, MUL VL]\n" + "addvl x26, x26, #3\n" + "zip1 z23.b, z18.b, z17.b\n" + "ld1b { z16.b }, p1/Z, [x25]\n" + "zip2 z22.b, z18.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x25, #1, MUL VL]\n" + "zip1 z20.b, z30.b, z24.b\n" + "ld1b { z19.b }, p1/Z, [x25, #2, MUL VL]\n" + "addvl x25, x25, #3\n" + "zip1 z18.b, z29.b, z16.b\n" + "zip2 z17.b, z29.b, z16.b\n" + "zip1 z16.b, z26.b, z18.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z26.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z16.b, z25.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z25.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip1 z18.b, z28.b, z21.b\n" + "zip2 z17.b, z28.b, z21.b\n" + "zip1 z16.b, z23.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z16.b, z23.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z16.b, z22.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z22.b, z17.b\n" + "zip1 z17.b, z27.b, z19.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z16.b, z20.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z16.b, z20.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z18.b, z30.b, z24.b\n" + "zip2 z17.b, z27.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z18.b }, p0/Z, [x9]\n" + "incd x9, ALL, MUL #6\n" + "ld1b { z21.b }, p0/Z, [x27]\n" + "incd x27, ALL, MUL #6\n" + "ld1b { z17.b }, p0/Z, [x26]\n" + "zip1 z20.b, z18.b, z17.b\n" + "ld1b { z16.b }, p0/Z, [x25]\n" + "incd x26, ALL, MUL #6\n" + "zip2 z19.b, z18.b, z17.b\n" + "incd x25, ALL, MUL #6\n" + "decw x20, ALL, MUL #3\n" + "zip1 z18.b, z21.b, z16.b\n" + "cmp x20, #0x0\n" + "zip2 z17.b, z21.b, z16.b\n" + "zip1 z16.b, z20.b, z18.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z20.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #3\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<3, 4, true, VLType::SVE>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_3VL_1x4( + reinterpret_cast(out), + 
reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<3, 4, true, VLType::SVE>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_3VL_1x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp new file mode 100644 index 0000000000..1864a16758 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 2) * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #3\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "zip1 z9.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "mov x19, x10\n"
+ "zip2 z8.h, z19.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z7.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z6.h, z18.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "zip1 z5.h, z21.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "zip2 z4.h, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "cmp x22, x21\n"
+ "zip1 z3.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip2 z2.h, z19.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z1.h, z20.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x26]\n"
+ "zip2 z0.h, z20.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip2 z30.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
+ "ld1h { z17.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.h, z19.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip2 z28.h, z19.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x24]\n"
+ "zip1 z27.h, z21.h, z17.h\n"
+ "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z25.h, z21.h, z17.h\n"
+ "ld1h { z24.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip1 z23.h, z20.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "zip2 z22.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z21.h, z19.h, z18.h\n"
+ "st1h { z9.h }, p2, [x20]\n"
+ "addvl x23, x23, #3\n"
+ "zip2 z20.h, z19.h, z18.h\n"
+ "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z19.h, z26.h, z17.h\n"
+ "st1h { z7.h }, p2, [x20, #2, MUL VL]\n"
+ "zip2 z18.h, z26.h, z17.h\n"
+ "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "st1h { z2.h }, p2, [x20, #4, MUL VL]\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z1.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z29.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z27.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h 
{ z20.h }, p2, [x20, #-2, MUL VL]\n" + "st1h { z19.h }, p2, [x20, #-1, MUL VL]\n" + "st1h { z6.h }, p2, [x19]\n" + "st1h { z5.h }, p2, [x19, #1, MUL VL]\n" + "st1h { z4.h }, p2, [x19, #2, MUL VL]\n" + "st1h { z0.h }, p2, [x19, #3, MUL VL]\n" + "st1h { z31.h }, p2, [x19, #4, MUL VL]\n" + "st1h { z30.h }, p2, [x19, #5, MUL VL]\n" + "st1h { z25.h }, p2, [x19, #6, MUL VL]\n" + "st1h { z23.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "st1h { z22.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n" + "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x10\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z17.h }, p1/Z, [x11]\n" + "ld1h { z16.h }, p1/Z, [x9]\n" + "zip1 z29.h, z17.h, z16.h\n" + "ld1h { z18.h }, p1/Z, [x28]\n" + "dech x20\n" + "zip2 z28.h, z17.h, z16.h\n" + "ld1h { z16.h }, p1/Z, [x27]\n" + "whilelt p0.h, XZR, x20\n" + "zip1 z27.h, z18.h, z16.h\n" + "ld1h { z17.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #1\n" + "zip2 z26.h, z18.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n" + "incd x11, ALL, MUL #4\n" + "zip1 z25.h, z17.h, z16.h\n" + "ld1h { z17.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x9, x9, #1\n" + "ld1h { z16.h }, p0/Z, [x27, #1, MUL VL]\n" + "zip1 z24.h, z17.h, z16.h\n" + "ld1h { z19.h }, p1/Z, [x26]\n" + "incd x9, ALL, MUL #4\n" + "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n" + "addvl x28, x28, #1\n" + "ld1h { z17.h }, p1/Z, [x25]\n" + "zip1 z23.h, z19.h, z17.h\n" + "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n" + "incd x28, ALL, MUL #4\n" + "zip2 z22.h, z19.h, z17.h\n" + "ld1h { z21.h }, p1/Z, [x24]\n" + "addvl x27, x27, #1\n" + "zip1 z20.h, z18.h, z16.h\n" + "ld1h { z19.h }, p0/Z, [x24, #1, MUL VL]\n" + "incd x27, ALL, MUL #4\n" + "ld1h { z17.h }, p1/Z, [x23]\n" + "zip1 z18.h, z21.h, z17.h\n" + "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n" + "addvl x26, x26, #1\n" + "zip2 z17.h, z21.h, z17.h\n" + "st1h { z29.h }, p2, [x19]\n" + "incd x26, ALL, MUL #4\n" + "zip1 z16.h, z19.h, z16.h\n" + "st1h { z28.h }, p2, [x19, #1, MUL VL]\n" + "addvl x25, x25, #1\n" + "st1h { z25.h }, p2, [x19, #2, MUL VL]\n" + "incd x25, ALL, MUL #4\n" + "st1h { z27.h }, p2, [x19, #3, MUL VL]\n" + "addvl x24, x24, #1\n" + "st1h { z26.h }, p2, [x19, #4, MUL VL]\n" + "incd x24, ALL, MUL #4\n" + "st1h { z24.h }, p2, [x19, #5, MUL VL]\n" + "addvl x23, x23, #1\n" + "st1h { z23.h }, p2, [x19, #6, MUL VL]\n" + "incd x23, ALL, MUL #4\n" + "st1h { z22.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "add x10, x10, %x[out_stride]\n" + "st1h { z20.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n" + "decw x22, ALL, MUL #3\n" + "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n" + "cmp x22, #0x0\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #12\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add %x[in], x9, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "csel x9, x9, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x20, %x[width]\n" + "cnth x19, ALL, MUL #3\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1h { z17.h }, p2/Z, [x11]\n" + "sub x20, x20, x19\n" + "ld1h { z22.h }, p2/Z, 
[x11, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z21.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z16.h, z22.h, z18.h\n"
+ "st1h { z20.h }, p2, [x10]\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "st1h { z17.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z17.h, z21.h, z19.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z16.h, z21.h, z19.h\n"
+ "st1h { z18.h }, p2, [x10]\n"
+ "st1h { z17.h }, p2, [x10, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #3\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "ld1h { z16.h }, p0/Z, [x9]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "dech x19\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z17.h, z16.h\n"
+ "st1h { z19.h }, p2, [x10]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z18.h }, p2, [x10, #1, MUL VL]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "incd x9, ALL, MUL #4\n"
+ "add x10, x10, %x[out_stride]\n"
+ "cmp x20, #0x0\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #3\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
new file mode 100644
index 0000000000..aa9d7220fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add x24, x26, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "mov x19, x25\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z31.h }, p0/Z, [x26]\n"
+ "ld1h { z30.h }, p0/Z, [x24]\n"
+ "dech x20\n"
+ "ld1h { z29.h }, p0/Z, [x23]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z28.h }, p0/Z, [x22]\n"
+ "dech x20\n"
+ "ld1h { z27.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "dech x20\n"
+ "ld1h { z25.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "ld1h { z23.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "dech x21, ALL, MUL #4\n"
+ "ld1h { z20.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ld1h { z18.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "st1h { z31.h }, p3, [x19]\n"
+ "st1h { z27.h }, p3, [x19, #1, MUL VL]\n"
+ "st1h { z25.h }, p3, [x19, #2, MUL VL]\n"
+ "st1h { z23.h }, p3, [x19, #3, MUL VL]\n"
+ "st1h { z30.h }, p3, [x19, #4, MUL VL]\n"
+ "st1h { z26.h }, p3, [x19, #5, MUL VL]\n"
+ "st1h { z24.h }, p3, [x19, #6, MUL VL]\n"
+ "st1h { z22.h }, p3, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z29.h }, p3, [x19, #-8, MUL VL]\n"
+ "st1h { z21.h }, p3, [x19, #-7, MUL VL]\n"
+ "st1h { z20.h }, p3, [x19, #-6, MUL VL]\n"
+ "st1h { z19.h }, p3, [x19, #-5, MUL VL]\n"
+ "st1h { z28.h }, p3, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p3, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x19, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x20, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "dech x20, ALL, MUL #4\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x25]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p3, [x25, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1h { z17.h }, p3, [x25, #2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x25, #3, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x1\n"
+ "bge 5b\n"
+ "8:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
new file mode 100644
index 0000000000..5e5f7a53a7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z3.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z20.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z2.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "ld1b { z1.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x25]\n"
+ "ld1b { z0.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z31.b, z3.b, z1.b\n"
+ "ld1b { z30.b }, p1/Z, [x24]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "ld1b { z29.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip1 z28.b, z18.b, z16.b\n"
+ "ld1b { z27.b }, p1/Z, [x23]\n"
+ "zip2 z26.b, z18.b, z16.b\n"
+ "ld1b { z25.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z18.b, z20.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x22]\n"
+ "zip1 z24.b, z2.b, z0.b\n"
+ "ld1b { z23.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x21]\n"
+ "zip2 z21.b, z19.b, z18.b\n"
+ "ld1b { z20.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z19.b, z30.b, z16.b\n"
+ "st1b { z28.b }, p1, [x28]\n"
+ "zip2 z18.b, z30.b, z16.b\n"
+ "st1b { z26.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z27.b, z22.b\n"
+ "st1b { z17.b }, p1, [x28, #2, MUL VL]\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "st1b { z21.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z17.b, z27.b, z22.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z31.b, z24.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z31.b, z24.b\n"
+ "zip2 z18.b, z3.b, z1.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z17.b, z2.b, z0.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z29.b, z23.b\n"
+ "zip1 z17.b, z25.b, z20.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z29.b, z23.b\n"
+ "zip2 z17.b, z25.b, z20.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, 
p1, [x28, #7, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z17.b }, p0/Z, [x9]\n" + "addvl x9, x9, #1\n" + "ld1b { z25.b }, p0/Z, [x27]\n" + "addvl x27, x27, #1\n" + "ld1b { z16.b }, p0/Z, [x26]\n" + "zip1 z18.b, z17.b, z16.b\n" + "ld1b { z24.b }, p0/Z, [x25]\n" + "addvl x26, x26, #1\n" + "zip2 z23.b, z17.b, z16.b\n" + "ld1b { z22.b }, p0/Z, [x24]\n" + "addvl x25, x25, #1\n" + "zip1 z16.b, z25.b, z24.b\n" + "ld1b { z21.b }, p0/Z, [x23]\n" + "addvl x24, x24, #1\n" + "zip1 z17.b, z18.b, z16.b\n" + "ld1b { z20.b }, p0/Z, [x22]\n" + "addvl x23, x23, #1\n" + "zip2 z18.b, z18.b, z16.b\n" + "ld1b { z19.b }, p0/Z, [x21]\n" + "addvl x22, x22, #1\n" + "zip2 z16.b, z25.b, z24.b\n" + "st1b { z17.b }, p1, [x28]\n" + "addvl x21, x21, #1\n" + "zip1 z17.b, z23.b, z16.b\n" + "st1b { z18.b }, p1, [x28, #1, MUL VL]\n" + "decw x20, ALL, MUL #4\n" + "zip2 z16.b, z23.b, z16.b\n" + "st1b { z17.b }, p1, [x28, #2, MUL VL]\n" + "cmp x20, #0x0\n" + "zip1 z18.b, z22.b, z20.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip1 z17.b, z21.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "zip2 z18.b, z22.b, z20.b\n" + "zip2 z17.b, z21.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #7, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #8\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[out]\n" + "add x27, x9, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add %x[in], x25, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x25, x25, %x[pad_row], GT\n" + "csel x26, x26, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x27, x27, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1b { z18.b }, p1/Z, [x9]\n" + "sub x20, x20, x19\n" + "ld1b { z19.b }, p1/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "ld1b { z25.b }, p1/Z, [x27]\n" + "cmp x20, x19\n" + "ld1b { z24.b }, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "ld1b { z17.b }, p1/Z, [x26]\n" + "zip1 z23.b, z18.b, z17.b\n" + "ld1b { z16.b }, p1/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "zip2 z22.b, z18.b, z17.b\n" + "ld1b { z18.b }, p1/Z, [x25]\n" + "ld1b { z21.b }, p1/Z, [x25, #1, MUL VL]\n" + "zip1 z20.b, z19.b, z16.b\n" + "addvl x25, x25, #2\n" + "zip2 z19.b, z19.b, z16.b\n" + "zip1 z17.b, z25.b, z18.b\n" + "zip1 z16.b, z23.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z23.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z17.b, z25.b, z18.b\n" + "zip1 z16.b, z22.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z22.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z18.b, z24.b, z21.b\n" + "zip2 z17.b, z24.b, z21.b\n" + "zip1 z16.b, z20.b, z18.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z20.b, z18.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, 
MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
new file mode 100644
index 0000000000..48040f9edb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 2) * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #4\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z21.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z19.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z23.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "mov x19, x10\n"
+ "ld1h { z31.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "zip1 z0.h, z21.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z22.h, z21.h, z29.h\n"
+ "ld1h { z15.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "zip1 z13.h, z19.h, z17.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z12.h, z19.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28]\n"
+ "cmp x22, x21\n"
+ "zip1 z14.h, z23.h, z15.h\n"
+ "ld1h { z1.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z3.h, z23.h, z15.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "zip1 z16.h, z31.h, z6.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "zip2 z10.h, z31.h, z6.h\n"
+ "ld1h { z27.h }, p2/Z, [x27]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z9.h, z20.h, z27.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "zip2 z24.h, z20.h, z27.h\n"
+ "ld1h { z5.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "zip1 z8.h, z1.h, z18.h\n"
+ "ld1h { z30.h }, p2/Z, [x26]\n"
+ "zip2 z17.h, z1.h, z18.h\n"
+ "ld1h { z28.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z6.h, z19.h, z2.h\n"
+ "ld1h { z23.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "zip2 z1.h, z19.h, z2.h\n"
+ "ld1h { z25.h }, p2/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "zip1 z31.h, z11.h, z5.h\n"
+ "ld1h { z21.h }, p2/Z, [x25]\n"
+ "zip2 z11.h, z11.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip1 z29.h, z30.h, z21.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "zip2 z30.h, z30.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24]\n"
+ "zip1 z27.h, z28.h, z19.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z28.h, z28.h, z19.h\n"
+ "ld1h { z4.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip1 z2.h, z23.h, z18.h\n"
+ "ld1h { z15.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip2 z5.h, z23.h, z18.h\n"
+ "ld1h { z23.h }, p2/Z, [x23]\n"
+ "zip1 z7.h, z25.h, z26.h\n"
+ "ld1h { z19.h }, 
p2/Z, [x23, #1, MUL VL]\n" + "zip2 z25.h, z25.h, z26.h\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1h { z0.h }, p2, [x20]\n" + "zip1 z0.h, z21.h, z23.h\n" + "zip2 z23.h, z21.h, z23.h\n" + "st1h { z22.h }, p2, [x20, #1, MUL VL]\n" + "zip1 z22.h, z20.h, z19.h\n" + "st1h { z13.h }, p2, [x20, #2, MUL VL]\n" + "zip2 z21.h, z20.h, z19.h\n" + "st1h { z12.h }, p2, [x20, #3, MUL VL]\n" + "zip1 z20.h, z4.h, z18.h\n" + "st1h { z9.h }, p2, [x20, #4, MUL VL]\n" + "zip2 z19.h, z4.h, z18.h\n" + "st1h { z24.h }, p2, [x20, #5, MUL VL]\n" + "zip1 z18.h, z15.h, z26.h\n" + "st1h { z8.h }, p2, [x20, #6, MUL VL]\n" + "zip2 z9.h, z15.h, z26.h\n" + "st1h { z17.h }, p2, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "st1h { z29.h }, p2, [x20, #-8, MUL VL]\n" + "st1h { z30.h }, p2, [x20, #-7, MUL VL]\n" + "st1h { z27.h }, p2, [x20, #-6, MUL VL]\n" + "st1h { z28.h }, p2, [x20, #-5, MUL VL]\n" + "st1h { z0.h }, p2, [x20, #-4, MUL VL]\n" + "st1h { z23.h }, p2, [x20, #-3, MUL VL]\n" + "st1h { z22.h }, p2, [x20, #-2, MUL VL]\n" + "st1h { z21.h }, p2, [x20, #-1, MUL VL]\n" + "st1h { z14.h }, p2, [x19]\n" + "st1h { z3.h }, p2, [x19, #1, MUL VL]\n" + "st1h { z16.h }, p2, [x19, #2, MUL VL]\n" + "st1h { z10.h }, p2, [x19, #3, MUL VL]\n" + "st1h { z6.h }, p2, [x19, #4, MUL VL]\n" + "st1h { z1.h }, p2, [x19, #5, MUL VL]\n" + "st1h { z31.h }, p2, [x19, #6, MUL VL]\n" + "st1h { z11.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1h { z2.h }, p2, [x19, #-8, MUL VL]\n" + "st1h { z5.h }, p2, [x19, #-7, MUL VL]\n" + "st1h { z7.h }, p2, [x19, #-6, MUL VL]\n" + "st1h { z25.h }, p2, [x19, #-5, MUL VL]\n" + "st1h { z20.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z19.h }, p2, [x19, #-3, MUL VL]\n" + "st1h { z18.h }, p2, [x19, #-2, MUL VL]\n" + "st1h { z9.h }, p2, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x10\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z18.h }, p1/Z, [x11]\n" + "ld1h { z16.h }, p1/Z, [x9]\n" + "zip1 z0.h, z18.h, z16.h\n" + "ld1h { z17.h }, p1/Z, [x28]\n" + "dech x20\n" + "zip2 z31.h, z18.h, z16.h\n" + "ld1h { z16.h }, p1/Z, [x27]\n" + "whilelt p0.h, XZR, x20\n" + "zip1 z30.h, z17.h, z16.h\n" + "ld1h { z18.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #2\n" + "zip2 z29.h, z17.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "zip1 z28.h, z18.h, z16.h\n" + "ld1h { z17.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x28, x28, #2\n" + "zip2 z27.h, z18.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "zip1 z26.h, z17.h, z16.h\n" + "ld1h { z18.h }, p1/Z, [x26]\n" + "add x10, x10, %x[out_stride]\n" + "zip2 z25.h, z17.h, z16.h\n" + "ld1h { z19.h }, p0/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "ld1h { z17.h }, p1/Z, [x25]\n" + "zip1 z24.h, z18.h, z17.h\n" + "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "zip2 z23.h, z18.h, z17.h\n" + "ld1h { z18.h }, p1/Z, [x24]\n" + "decw x22, ALL, MUL #4\n" + "zip1 z22.h, z19.h, z16.h\n" + "ld1h { z21.h }, p0/Z, [x24, #1, MUL VL]\n" + "addvl x24, x24, #2\n" + "zip2 z20.h, z19.h, z16.h\n" + "ld1h { z17.h }, p1/Z, [x23]\n" + "cmp x22, #0x0\n" + "zip1 z19.h, z18.h, z17.h\n" + "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "zip2 z18.h, z18.h, z17.h\n" + "st1h { z0.h }, p2, [x19]\n" + "st1h { z31.h }, p2, [x19, #1, MUL VL]\n" + "zip1 z17.h, z21.h, z16.h\n" + "st1h 
{ z28.h }, p2, [x19, #2, MUL VL]\n" + "zip2 z16.h, z21.h, z16.h\n" + "st1h { z27.h }, p2, [x19, #3, MUL VL]\n" + "st1h { z30.h }, p2, [x19, #4, MUL VL]\n" + "st1h { z29.h }, p2, [x19, #5, MUL VL]\n" + "st1h { z26.h }, p2, [x19, #6, MUL VL]\n" + "st1h { z25.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1h { z24.h }, p2, [x19, #-8, MUL VL]\n" + "st1h { z23.h }, p2, [x19, #-7, MUL VL]\n" + "st1h { z22.h }, p2, [x19, #-6, MUL VL]\n" + "st1h { z20.h }, p2, [x19, #-5, MUL VL]\n" + "st1h { z19.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n" + "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #16\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add %x[in], x9, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "csel x9, x9, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x20, %x[width]\n" + "cnth x19, ALL, MUL #4\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1h { z18.h }, p2/Z, [x11]\n" + "sub x20, x20, x19\n" + "ld1h { z24.h }, p2/Z, [x11, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1h { z23.h }, p2/Z, [x11, #2, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "ld1h { z16.h }, p2/Z, [x9]\n" + "zip1 z21.h, z18.h, z16.h\n" + "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n" + "zip2 z16.h, z18.h, z16.h\n" + "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n" + "zip1 z18.h, z24.h, z17.h\n" + "st1h { z21.h }, p2, [x10]\n" + "addvl x9, x9, #4\n" + "zip2 z17.h, z24.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #1, MUL VL]\n" + "zip1 z16.h, z23.h, z20.h\n" + "st1h { z18.h }, p2, [x10, #2, MUL VL]\n" + "zip2 z18.h, z23.h, z20.h\n" + "st1h { z17.h }, p2, [x10, #3, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "zip1 z17.h, z22.h, z19.h\n" + "st1h { z16.h }, p2, [x10]\n" + "zip2 z16.h, z22.h, z19.h\n" + "st1h { z18.h }, p2, [x10, #1, MUL VL]\n" + "st1h { z17.h }, p2, [x10, #2, MUL VL]\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decw x20, ALL, MUL #4\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x11]\n" + "ld1h { z16.h }, p0/Z, [x9]\n" + "zip1 z20.h, z17.h, z16.h\n" + "dech x19\n" + "zip2 z19.h, z17.h, z16.h\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z18.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #2\n" + "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n" + "zip1 z17.h, z18.h, z16.h\n" + "st1h { z20.h }, p2, [x10]\n" + "addvl x9, x9, #2\n" + "zip2 z16.h, z18.h, z16.h\n" + "st1h { z19.h }, p2, [x10, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1h { z17.h }, p2, [x10, #2, MUL VL]\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #4\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", 
"z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<4, 2, true, VLType::SVE>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_4VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp new file mode 100644 index 0000000000..67ef738645 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_6VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 8) * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x22, x22, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z22.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z21.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z12.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1b { z20.b }, p1/Z, [x27]\n"
+ "ld1b { z11.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1b { z10.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z9.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z8.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "ld1b { z7.b }, p1/Z, [x25]\n"
+ "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z5.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z18.b, z22.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "zip2 z4.b, z22.b, z16.b\n"
+ "ld1b { z3.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip1 z2.b, z21.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip2 z1.b, z21.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z31.b, z12.b, z3.b\n"
+ "ld1b { z30.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip1 z29.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x22]\n"
+ "zip2 z28.b, z20.b, z16.b\n"
+ "ld1b { z27.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "zip1 z26.b, z11.b, z0.b\n"
+ "ld1b { z25.b }, p1/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "ld1b { z24.b }, p1/Z, [x21]\n"
+ "zip2 z21.b, z19.b, z17.b\n"
+ "ld1b { z22.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "ld1b { z23.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #3\n"
+ "zip1 z19.b, z7.b, z24.b\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "zip1 z17.b, z29.b, z19.b\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z17.b, z29.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z4.b, z21.b\n"
+ "zip2 z19.b, z7.b, z24.b\n"
+ "zip1 z17.b, z28.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL 
VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z18.b, z4.b, z21.b\n" + "zip2 z17.b, z28.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip1 z20.b, z9.b, z27.b\n" + "zip1 z18.b, z2.b, z20.b\n" + "zip1 z19.b, z6.b, z22.b\n" + "zip1 z17.b, z26.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z18.b, z2.b, z20.b\n" + "zip2 z17.b, z26.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z21.b, z9.b, z27.b\n" + "zip2 z20.b, z11.b, z0.b\n" + "zip1 z18.b, z1.b, z21.b\n" + "zip2 z19.b, z6.b, z22.b\n" + "zip1 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z18.b, z1.b, z21.b\n" + "zip2 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip1 z21.b, z8.b, z25.b\n" + "zip1 z18.b, z31.b, z21.b\n" + "zip1 z20.b, z10.b, z30.b\n" + "zip1 z19.b, z5.b, z23.b\n" + "zip1 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip2 z18.b, z31.b, z21.b\n" + "zip2 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z22.b, z12.b, z3.b\n" + "zip2 z21.b, z8.b, z25.b\n" + "zip1 z18.b, z22.b, z21.b\n" + "zip2 z20.b, z10.b, z30.b\n" + "zip2 z19.b, z5.b, z23.b\n" + "zip1 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z18.b, z22.b, z21.b\n" + "zip2 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z18.b }, p0/Z, [x9]\n" + "incd x9, ALL, MUL #6\n" + "ld1b { z28.b }, p0/Z, [x27]\n" + "incd x27, ALL, MUL #6\n" + "ld1b { z17.b }, p0/Z, [x26]\n" + "incd x26, ALL, MUL #6\n" + "ld1b { z27.b }, p0/Z, [x25]\n" + "incd x25, ALL, MUL #6\n" + "ld1b { z16.b }, p0/Z, [x24]\n" + "zip1 z26.b, z18.b, z16.b\n" + "ld1b { z25.b }, p0/Z, [x23]\n" + "incd x24, ALL, MUL #6\n" + "zip2 z24.b, z18.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x22]\n" + "incd x23, ALL, MUL #6\n" + "zip1 z23.b, z28.b, z25.b\n" + "ld1b { z22.b }, p0/Z, [x21]\n" + "incd x22, ALL, MUL #6\n" + "zip1 z21.b, z17.b, z16.b\n" + "incd x21, ALL, MUL #6\n" + "zip2 z20.b, z17.b, z16.b\n" + "decd x20, ALL, MUL #6\n" + "zip1 z18.b, z26.b, z21.b\n" + "cmp x20, #0x0\n" + "zip1 z19.b, z27.b, z22.b\n" + "zip1 z17.b, z23.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z18.b, z26.b, z21.b\n" + "zip2 z17.b, z23.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { 
z16.b }, p1, [x28, #2, MUL VL]\n"
+      "zip2 z16.b, z18.b, z17.b\n"
+      "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+      "zip1 z18.b, z24.b, z20.b\n"
+      "zip2 z17.b, z28.b, z25.b\n"
+      "zip2 z16.b, z27.b, z22.b\n"
+      "zip1 z17.b, z17.b, z16.b\n"
+      "zip1 z16.b, z18.b, z17.b\n"
+      "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+      "zip2 z16.b, z18.b, z17.b\n"
+      "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+      "add x28, x28, %x[out_stride]\n"
+      "bgt 4b\n"
+      "5:" // Main row loop: Column loop skip
+      "addvl %x[out], %x[out], #6\n"
+      "cmp %x[height], #0x1\n"
+      "bge 1b\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+    uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sve_transpose_interleave_6VL_1x8(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(uint8_t) / 1,
+        stride * sizeof(uint8_t),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+    int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sve_transpose_interleave_6VL_1x8(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(int8_t) / 1,
+        stride * sizeof(int8_t),
+        (kmax-k0)
+    );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
new file mode 100644
index 0000000000..19d3d9dfe4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint16_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 6 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p2.b\n" + "cmp %x[height], #0x8\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add %x[in], x23, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x22, %x[width]\n" + "cnth x21, ALL, MUL #3\n" + "cmp x22, x21\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1h { z19.h }, p2/Z, [x11]\n" + "mov x20, x10\n" + "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "ld1h { z10.h }, p2/Z, [x11, #2, MUL VL]\n" + "addvl x11, x11, #3\n" + "ld1h { z24.h }, p2/Z, [x9]\n" + "mov x19, x10\n" + "ld1h { z23.h }, p2/Z, [x9, #1, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "ld1h { z9.h }, p2/Z, [x9, #2, MUL VL]\n" + "addvl x9, x9, #3\n" + "ld1h { z16.h }, p2/Z, [x28]\n" + "zip1 z22.h, z19.h, z16.h\n" + "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n" + "sub x22, x22, x21\n" + "zip2 z21.h, z19.h, z16.h\n" + "ld1h { z8.h }, p2/Z, [x28, #2, MUL VL]\n" + "addvl x28, x28, #3\n" + "zip1 z20.h, z18.h, z17.h\n" + "ld1h { z16.h }, p2/Z, [x27]\n" + "cmp x22, x21\n" + "zip2 z7.h, z18.h, z17.h\n" + "ld1h { z19.h }, p2/Z, [x27, #1, MUL VL]\n" + "zip1 z6.h, z10.h, z8.h\n" + "ld1h { z5.h }, p2/Z, [x27, #2, MUL VL]\n" + "addvl x27, x27, #3\n" + "zip1 z17.h, z24.h, z16.h\n" + "ld1h { z18.h }, p2/Z, [x26]\n" + "zip2 z16.h, z24.h, z16.h\n" + "ld1h { z4.h }, p2/Z, [x26, #1, MUL VL]\n" + "zip1 z3.h, z22.h, z17.h\n" + "ld1h { z2.h }, p2/Z, [x26, #2, MUL VL]\n" + "addvl x26, x26, #3\n" + "zip2 z1.h, z22.h, z17.h\n" + "ld1h { z0.h }, p2/Z, [x25]\n" + "zip1 z31.h, z21.h, z16.h\n" + "ld1h { z30.h }, p2/Z, [x25, #1, MUL VL]\n" + "zip2 z29.h, z21.h, z16.h\n" + "ld1h { z28.h }, p2/Z, [x25, #2, MUL VL]\n" + "addvl x25, x25, #3\n" + "zip1 z16.h, z23.h, z19.h\n" + "ld1h { z17.h }, p2/Z, [x24]\n" + "zip2 z27.h, z23.h, z19.h\n" + "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n" + "zip1 z25.h, z20.h, z16.h\n" + "ld1h { z24.h }, p2/Z, [x24, #2, MUL VL]\n" + "addvl x24, x24, #3\n" + "zip2 z23.h, z20.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x23]\n" + "zip1 z20.h, z18.h, z17.h\n" + "ld1h { z22.h }, p2/Z, [x23, #1, MUL VL]\n" + "zip2 z19.h, z18.h, z17.h\n" + "ld1h { z21.h }, p2/Z, [x23, #2, MUL VL]\n" + "addvl x23, x23, #3\n" + "zip1 z18.h, z0.h, z16.h\n" + "st1h { z3.h }, p2, [x20]\n" + "zip2 z17.h, z0.h, z16.h\n" + "st1h { z1.h }, p2, [x20, #1, MUL VL]\n" + "zip1 z16.h, z20.h, z18.h\n" + "st1h { z31.h }, p2, [x20, #2, MUL VL]\n" + "zip2 z18.h, z20.h, z18.h\n" + "st1h { z29.h }, p2, [x20, #3, MUL VL]\n" + "zip1 z20.h, z19.h, z17.h\n" + "st1h { z25.h }, p2, [x20, #4, MUL VL]\n" + "zip2 z19.h, z19.h, z17.h\n" + "st1h { z23.h }, p2, [x20, #5, MUL VL]\n" + "zip1 z17.h, z4.h, z26.h\n" + "st1h { z16.h }, p2, [x20, #6, MUL VL]\n" + "zip1 z16.h, z30.h, z22.h\n" + "st1h { z18.h }, p2, [x20, #7, MUL VL]\n" + "addvl x20, x20, #12\n" + "zip1 z18.h, z17.h, z16.h\n" + "st1h { z20.h }, p2, [x20, #-4, 
MUL VL]\n" + "zip2 z16.h, z17.h, z16.h\n" + "st1h { z19.h }, p2, [x20, #-3, MUL VL]\n" + "zip1 z17.h, z7.h, z27.h\n" + "st1h { z18.h }, p2, [x20, #-2, MUL VL]\n" + "zip2 z18.h, z7.h, z27.h\n" + "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n" + "zip1 z16.h, z9.h, z5.h\n" + "st1h { z17.h }, p2, [x19]\n" + "zip1 z17.h, z6.h, z16.h\n" + "st1h { z18.h }, p2, [x19, #1, MUL VL]\n" + "zip2 z16.h, z6.h, z16.h\n" + "st1h { z17.h }, p2, [x19, #2, MUL VL]\n" + "zip2 z18.h, z10.h, z8.h\n" + "st1h { z16.h }, p2, [x19, #3, MUL VL]\n" + "zip2 z17.h, z9.h, z5.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #5, MUL VL]\n" + "zip2 z18.h, z4.h, z26.h\n" + "zip2 z17.h, z30.h, z22.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #6, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "zip1 z18.h, z2.h, z24.h\n" + "zip1 z17.h, z28.h, z21.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n" + "zip2 z18.h, z2.h, z24.h\n" + "zip2 z17.h, z28.h, z21.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x10\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z18.h }, p1/Z, [x11]\n" + "ld1h { z23.h }, p1/Z, [x9]\n" + "dech x20\n" + "ld1h { z16.h }, p1/Z, [x28]\n" + "zip1 z17.h, z18.h, z16.h\n" + "ld1h { z20.h }, p1/Z, [x27]\n" + "whilelt p0.h, XZR, x20\n" + "zip2 z22.h, z18.h, z16.h\n" + "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #1\n" + "zip1 z16.h, z23.h, z20.h\n" + "ld1h { z19.h }, p0/Z, [x9, #1, MUL VL]\n" + "incd x11, ALL, MUL #4\n" + "zip1 z0.h, z17.h, z16.h\n" + "ld1h { z18.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x9, x9, #1\n" + "zip2 z31.h, z17.h, z16.h\n" + "ld1h { z17.h }, p0/Z, [x27, #1, MUL VL]\n" + "incd x9, ALL, MUL #4\n" + "zip2 z16.h, z23.h, z20.h\n" + "ld1h { z30.h }, p1/Z, [x26]\n" + "addvl x28, x28, #1\n" + "zip1 z20.h, z22.h, z16.h\n" + "ld1h { z29.h }, p0/Z, [x26, #1, MUL VL]\n" + "incd x28, ALL, MUL #4\n" + "zip2 z28.h, z22.h, z16.h\n" + "ld1h { z27.h }, p1/Z, [x25]\n" + "addvl x27, x27, #1\n" + "zip1 z18.h, z21.h, z18.h\n" + "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n" + "incd x27, ALL, MUL #4\n" + "zip1 z17.h, z19.h, z17.h\n" + "ld1h { z16.h }, p1/Z, [x24]\n" + "addvl x26, x26, #1\n" + "zip1 z25.h, z18.h, z17.h\n" + "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n" + "incd x26, ALL, MUL #4\n" + "zip2 z23.h, z18.h, z17.h\n" + "ld1h { z22.h }, p1/Z, [x23]\n" + "addvl x25, x25, #1\n" + "zip1 z19.h, z30.h, z16.h\n" + "ld1h { z21.h }, p0/Z, [x23, #1, MUL VL]\n" + "incd x25, ALL, MUL #4\n" + "zip2 z17.h, z30.h, z16.h\n" + "st1h { z0.h }, p2, [x19]\n" + "addvl x24, x24, #1\n" + "zip1 z16.h, z27.h, z22.h\n" + "st1h { z31.h }, p2, [x19, #1, MUL VL]\n" + "incd x24, ALL, MUL #4\n" + "zip1 z18.h, z19.h, z16.h\n" + "st1h { z20.h }, p2, [x19, #2, MUL VL]\n" + "addvl x23, x23, #1\n" + "zip2 z20.h, z19.h, z16.h\n" + "st1h { z28.h }, p2, [x19, #3, MUL VL]\n" + "incd x23, ALL, MUL #4\n" + "zip2 z16.h, z27.h, z22.h\n" + "st1h { z25.h }, p2, [x19, #4, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "zip1 z19.h, z17.h, z16.h\n" + "st1h { z23.h }, p2, [x19, #5, MUL VL]\n" + "decd x22, ALL, MUL #6\n" + 
"zip2 z17.h, z17.h, z16.h\n" + "st1h { z18.h }, p2, [x19, #6, MUL VL]\n" + "cmp x22, #0x0\n" + "zip1 z18.h, z29.h, z24.h\n" + "st1h { z20.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "zip1 z16.h, z26.h, z21.h\n" + "st1h { z19.h }, p2, [x19, #-4, MUL VL]\n" + "st1h { z17.h }, p2, [x19, #-3, MUL VL]\n" + "zip1 z17.h, z18.h, z16.h\n" + "zip2 z16.h, z18.h, z16.h\n" + "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #12\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add %x[in], x27, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x27, x27, %x[pad_row], GT\n" + "csel x28, x28, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x9, x9, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cnth x19, ALL, MUL #3\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1h { z19.h }, p2/Z, [x11]\n" + "sub x20, x20, x19\n" + "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1h { z30.h }, p2/Z, [x11, #2, MUL VL]\n" + "addvl x11, x11, #3\n" + "ld1h { z29.h }, p2/Z, [x9]\n" + "ld1h { z28.h }, p2/Z, [x9, #1, MUL VL]\n" + "ld1h { z27.h }, p2/Z, [x9, #2, MUL VL]\n" + "addvl x9, x9, #3\n" + "ld1h { z16.h }, p2/Z, [x28]\n" + "zip1 z26.h, z19.h, z16.h\n" + "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n" + "zip2 z25.h, z19.h, z16.h\n" + "ld1h { z24.h }, p2/Z, [x28, #2, MUL VL]\n" + "addvl x28, x28, #3\n" + "zip1 z23.h, z18.h, z17.h\n" + "ld1h { z16.h }, p2/Z, [x27]\n" + "zip2 z22.h, z18.h, z17.h\n" + "ld1h { z21.h }, p2/Z, [x27, #1, MUL VL]\n" + "zip1 z20.h, z30.h, z24.h\n" + "ld1h { z19.h }, p2/Z, [x27, #2, MUL VL]\n" + "addvl x27, x27, #3\n" + "zip1 z18.h, z29.h, z16.h\n" + "zip2 z17.h, z29.h, z16.h\n" + "zip1 z16.h, z26.h, z18.h\n" + "st1h { z16.h }, p2, [x10]\n" + "zip2 z16.h, z26.h, z18.h\n" + "st1h { z16.h }, p2, [x10, #1, MUL VL]\n" + "zip1 z16.h, z25.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #2, MUL VL]\n" + "zip2 z16.h, z25.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "zip1 z17.h, z28.h, z21.h\n" + "zip1 z16.h, z23.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z23.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "zip2 z18.h, z28.h, z21.h\n" + "zip1 z17.h, z27.h, z19.h\n" + "zip1 z16.h, z22.h, z18.h\n" + "st1h { z16.h }, p2, [x10]\n" + "zip2 z16.h, z22.h, z18.h\n" + "st1h { z16.h }, p2, [x10, #1, MUL VL]\n" + "zip1 z16.h, z20.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #2, MUL VL]\n" + "zip2 z16.h, z20.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "zip2 z18.h, z30.h, z24.h\n" + "zip2 z17.h, z27.h, z19.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decd x20, ALL, MUL #6\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x11]\n" + "ld1h { z25.h }, p0/Z, [x9]\n" + "dech x19\n" + "ld1h { z16.h }, p0/Z, [x28]\n" + "zip1 z18.h, z17.h, z16.h\n" + "ld1h { z24.h }, p0/Z, [x27]\n" + "whilelt p0.h, XZR, 
x19\n" + "zip2 z23.h, z17.h, z16.h\n" + "ld1h { z22.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #1\n" + "zip1 z16.h, z25.h, z24.h\n" + "ld1h { z21.h }, p0/Z, [x9, #1, MUL VL]\n" + "incd x11, ALL, MUL #4\n" + "zip1 z17.h, z18.h, z16.h\n" + "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x9, x9, #1\n" + "zip2 z18.h, z18.h, z16.h\n" + "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n" + "incd x9, ALL, MUL #4\n" + "zip2 z16.h, z25.h, z24.h\n" + "st1h { z17.h }, p2, [x10]\n" + "addvl x28, x28, #1\n" + "zip1 z17.h, z23.h, z16.h\n" + "st1h { z18.h }, p2, [x10, #1, MUL VL]\n" + "incd x28, ALL, MUL #4\n" + "zip2 z16.h, z23.h, z16.h\n" + "st1h { z17.h }, p2, [x10, #2, MUL VL]\n" + "addvl x27, x27, #1\n" + "zip1 z18.h, z22.h, z20.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "incd x27, ALL, MUL #4\n" + "zip1 z17.h, z21.h, z19.h\n" + "cmp x20, #0x0\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #6\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<6, 4, true, VLType::SVE>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_6VL_2x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..94ce157185 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_6VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 6 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p3.b\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "mov x22, %x[width]\n" + "cnth x19, ALL, MUL #3\n" + "add x21, x23, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "add %x[in], x21, %x[in_stride]\n" + "csel x21, x21, %x[pad_row], GT\n" + "csel x23, x23, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x24, x24, %x[pad_row], GT\n" + "cmp x22, x19\n" + "mov x20, %x[out]\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z17.s }, p3/Z, [x25]\n" + "ld1w { z18.s }, p3/Z, [x25, #1, MUL VL]\n" + "sub x22, x22, x19\n" + "cmp x22, x19\n" + "ld1w { z19.s }, p3/Z, [x25, #2, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "zip1 z21.s, z17.s, z16.s\n" + "zip2 z20.s, z17.s, z16.s\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23, #2, MUL VL]\n" + "zip1 z29.s, z18.s, z17.s\n" + "zip2 z28.s, z18.s, z17.s\n" + "ld1w { z17.s }, p3/Z, [x25, #3, MUL VL]\n" + "ld1w { z18.s }, p3/Z, [x25, #4, MUL VL]\n" + "zip1 z27.s, z19.s, z16.s\n" + "zip2 z26.s, z19.s, z16.s\n" + "ld1w { z19.s }, p3/Z, [x25, #5, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23, #3, MUL VL]\n" + "zip1 z25.s, z17.s, z16.s\n" + "zip2 z24.s, z17.s, z16.s\n" + "ld1w { z17.s }, p3/Z, [x23, #4, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23, #5, MUL VL]\n" + "zip1 z12.s, z18.s, z17.s\n" + "zip2 z11.s, z18.s, z17.s\n" + "ld1w { z18.s }, p3/Z, [x24]\n" + "ld1w { z23.s }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z10.s, z19.s, z16.s\n" + "zip2 z9.s, z19.s, z16.s\n" + "ld1w { z22.s }, p3/Z, [x24, #2, MUL VL]\n" + "ld1w { z17.s }, p3/Z, [x21]\n" + ".inst 0x658aaea8 // bfcvt z8.h, p3/M, z21.s\n" + "zip1 z7.s, z18.s, z17.s\n" + "ld1w { z16.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #2, MUL VL]\n" + ".inst 0x658aae86 // bfcvt z6.h, p3/M, z20.s\n" + "zip2 z5.s, z18.s, z17.s\n" + "ld1w { z20.s }, p3/Z, [x24, #3, MUL VL]\n" + "ld1w { z19.s }, p3/Z, [x24, #4, MUL VL]\n" + ".inst 0x658aafa4 // bfcvt z4.h, p3/M, z29.s\n" + "zip1 z3.s, z23.s, z16.s\n" + "ld1w { z2.s }, p3/Z, [x24, #5, MUL VL]\n" + "ld1w { z18.s }, p3/Z, [x21, #3, MUL VL]\n" + ".inst 0x658aaf81 // bfcvt z1.h, p3/M, z28.s\n" + "zip2 z0.s, z23.s, z16.s\n" + "ld1w { z17.s }, p3/Z, [x21, #4, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x21, #5, MUL VL]\n" + ".inst 0x658aaf7f // bfcvt z31.h, p3/M, z27.s\n" + "zip1 z30.s, z22.s, z21.s\n" + ".inst 0x658aaf5d // bfcvt z29.h, p3/M, z26.s\n" + "zip2 z28.s, z22.s, z21.s\n" + "addvl x25, x25, #6\n" + "addvl x24, x24, #6\n" + ".inst 0x658aaf3b // bfcvt z27.h, p3/M, z25.s\n" + "zip1 z26.s, z20.s, z18.s\n" + "addvl x23, x23, #6\n" + "addvl x21, x21, #6\n" + ".inst 0x658aaf19 // bfcvt z25.h, p3/M, z24.s\n" + "zip2 z24.s, z20.s, z18.s\n" + ".inst 0x658aad97 // bfcvt z23.h, p3/M, z12.s\n" + "zip1 z22.s, 
z19.s, z17.s\n" + ".inst 0x658aad75 // bfcvt z21.h, p3/M, z11.s\n" + "zip2 z20.s, z19.s, z17.s\n" + ".inst 0x658aad53 // bfcvt z19.h, p3/M, z10.s\n" + "zip1 z18.s, z2.s, z16.s\n" + ".inst 0x658aad31 // bfcvt z17.h, p3/M, z9.s\n" + "zip2 z16.s, z2.s, z16.s\n" + ".inst 0x648aace8 // bfcvtnt z8.h, p3/M, z7.s\n" + ".inst 0x648aaca6 // bfcvtnt z6.h, p3/M, z5.s\n" + "st1h { z8.h }, p3, [x20]\n" + ".inst 0x648aac64 // bfcvtnt z4.h, p3/M, z3.s\n" + ".inst 0x648aac01 // bfcvtnt z1.h, p3/M, z0.s\n" + "st1h { z6.h }, p3, [x20, #1, MUL VL]\n" + ".inst 0x648aafdf // bfcvtnt z31.h, p3/M, z30.s\n" + ".inst 0x648aaf9d // bfcvtnt z29.h, p3/M, z28.s\n" + "st1h { z4.h }, p3, [x20, #2, MUL VL]\n" + "st1h { z1.h }, p3, [x20, #3, MUL VL]\n" + ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n" + ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n" + "st1h { z31.h }, p3, [x20, #4, MUL VL]\n" + ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n" + ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n" + "st1h { z29.h }, p3, [x20, #5, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n" + ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n" + "st1h { z27.h }, p3, [x20]\n" + "st1h { z25.h }, p3, [x20, #1, MUL VL]\n" + "st1h { z23.h }, p3, [x20, #2, MUL VL]\n" + "st1h { z21.h }, p3, [x20, #3, MUL VL]\n" + "st1h { z19.h }, p3, [x20, #4, MUL VL]\n" + "st1h { z17.h }, p3, [x20, #5, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x19, x22\n" + "whilelt p2.s, XZR, x19\n" + "ld1w { z20.s }, p2/Z, [x25]\n" + "ld1w { z19.s }, p2/Z, [x23]\n" + "decw x19\n" + "whilelt p1.s, XZR, x19\n" + "ld1w { z18.s }, p1/Z, [x25, #1, MUL VL]\n" + "ld1w { z17.s }, p1/Z, [x23, #1, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z25.s }, p0/Z, [x25, #2, MUL VL]\n" + "ld1w { z16.s }, p0/Z, [x23, #2, MUL VL]\n" + "ld1w { z24.s }, p2/Z, [x24]\n" + "ld1w { z30.s }, p1/Z, [x24, #1, MUL VL]\n" + "zip1 z23.s, z20.s, z19.s\n" + "zip2 z22.s, z20.s, z19.s\n" + "ld1w { z29.s }, p0/Z, [x24, #2, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x21]\n" + "zip1 z20.s, z18.s, z17.s\n" + "zip2 z19.s, z18.s, z17.s\n" + "ld1w { z18.s }, p1/Z, [x21, #1, MUL VL]\n" + "ld1w { z28.s }, p0/Z, [x21, #2, MUL VL]\n" + "zip1 z17.s, z25.s, z16.s\n" + "zip2 z16.s, z25.s, z16.s\n" + "decd x22, ALL, MUL #6\n" + ".inst 0x658aaefb // bfcvt z27.h, p3/M, z23.s\n" + "zip1 z26.s, z24.s, z21.s\n" + "cmp x22, #0x0\n" + ".inst 0x658aaed9 // bfcvt z25.h, p3/M, z22.s\n" + "zip2 z24.s, z24.s, z21.s\n" + "addvl x25, x25, #3\n" + "addvl x24, x24, #3\n" + ".inst 0x658aae97 // bfcvt z23.h, p3/M, z20.s\n" + "zip1 z22.s, z30.s, z18.s\n" + "addvl x23, x23, #3\n" + "addvl x21, x21, #3\n" + ".inst 0x658aae75 // bfcvt z21.h, p3/M, z19.s\n" + "zip2 z20.s, z30.s, z18.s\n" + ".inst 0x658aae33 // bfcvt z19.h, p3/M, z17.s\n" + "zip1 z18.s, z29.s, z28.s\n" + ".inst 0x658aae11 // bfcvt z17.h, p3/M, z16.s\n" + "zip2 z16.s, z29.s, z28.s\n" + ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n" + ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n" + "st1h { z27.h }, p3, [x20]\n" + ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n" + ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n" + "st1h { z25.h }, p3, [x20, #1, MUL VL]\n" + ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n" + ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n" + "st1h { z23.h }, p3, [x20, #2, MUL VL]\n" + "st1h { z21.h }, p3, [x20, #3, MUL VL]\n" + "st1h { z19.h }, p3, [x20, #4, 
MUL VL]\n" + "st1h { z17.h }, p3, [x20, #5, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #6\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +template<> +void Transform<6, 4, true, VLType::SVE>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_6VL_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp new file mode 100644 index 0000000000..46b160b071 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + uint32_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint32_t))); + + if (height % 2) { + memset(pad_row, 0, width * sizeof(uint32_t)); + } + + size_t out_stride = 6 * roundup(height, 2) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p2.b\n" + "cmp %x[height], #0x4\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x27, %x[in]\n" + "mov x26, %x[out]\n" + "add x25, x27, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add %x[in], x23, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x22, %x[width]\n" + "cntw x21, ALL, MUL #6\n" + "cmp x22, x21\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z19.s }, p2/Z, [x27]\n" + "mov x20, x26\n" + "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "ld1w { z21.s }, p2/Z, [x27, #2, MUL VL]\n" + "mov x19, x26\n" + "ld1w { z26.s }, p2/Z, [x27, #3, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "ld1w { z25.s }, p2/Z, [x27, #4, MUL VL]\n" + "sub x22, x22, x21\n" + "ld1w { z24.s }, p2/Z, [x27, #5, MUL VL]\n" + "addvl x27, x27, #6\n" + "ld1w { z16.s }, p2/Z, [x25]\n" + "zip1 z23.s, z19.s, z16.s\n" + "ld1w { z17.s }, p2/Z, [x25, #1, MUL VL]\n" + "cmp x22, x21\n" + "zip2 z9.s, z19.s, z16.s\n" + "ld1w { z20.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x25, #3, MUL VL]\n" + "zip1 z8.s, z18.s, z17.s\n" + "ld1w { z16.s }, p2/Z, [x25, #4, MUL VL]\n" + "zip2 z7.s, z18.s, z17.s\n" + "ld1w { z18.s }, p2/Z, [x25, #5, MUL VL]\n" + "addvl x25, x25, #6\n" + "zip1 z6.s, z21.s, z20.s\n" + "ld1w { z17.s }, p2/Z, [x24]\n" + "zip2 z5.s, z21.s, z20.s\n" + "ld1w { z22.s }, p2/Z, [x24, #1, MUL VL]\n" + "zip1 z4.s, z26.s, z19.s\n" + "ld1w { z21.s }, p2/Z, [x24, #2, MUL VL]\n" + "zip2 z3.s, z26.s, z19.s\n" + "ld1w { z2.s }, p2/Z, [x24, #3, MUL VL]\n" + "zip1 z1.s, z25.s, z16.s\n" + "ld1w { z0.s }, p2/Z, [x24, #4, MUL VL]\n" + "zip2 z31.s, z25.s, z16.s\n" + "ld1w { z30.s }, p2/Z, [x24, #5, MUL VL]\n" + "addvl x24, x24, #6\n" + "zip1 z29.s, z24.s, z18.s\n" + "ld1w { z16.s }, p2/Z, [x23]\n" + "zip2 z28.s, z24.s, z18.s\n" + "ld1w { z20.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z27.s, z17.s, z16.s\n" + "ld1w { z18.s }, p2/Z, [x23, #3, MUL VL]\n" + "zip2 z26.s, z17.s, z16.s\n" + "ld1w { z17.s }, p2/Z, [x23, #4, MUL VL]\n" + "zip1 z25.s, z22.s, z20.s\n" + "ld1w { z16.s }, p2/Z, [x23, #5, MUL VL]\n" + "addvl x23, x23, #6\n" + "zip2 z24.s, z22.s, z20.s\n" + "st1w { z23.s }, p2, [x20]\n" + "zip1 z23.s, z21.s, z19.s\n" + "st1w { z9.s }, p2, [x20, #1, MUL VL]\n" + "zip2 z22.s, z21.s, z19.s\n" + "st1w { z8.s }, p2, [x20, #2, MUL VL]\n" + "zip1 z21.s, z2.s, z18.s\n" + "st1w { z7.s }, p2, [x20, #3, MUL VL]\n" + "zip2 z20.s, z2.s, z18.s\n" + "st1w { z6.s }, p2, [x20, #4, MUL VL]\n" + "zip1 z19.s, z0.s, z17.s\n" + "st1w { z5.s }, p2, [x20, #5, MUL VL]\n" + "zip2 z18.s, z0.s, z17.s\n" + "st1w { z27.s }, p2, [x20, #6, MUL VL]\n" + "zip1 z17.s, z30.s, z16.s\n" + "st1w { z26.s }, p2, [x20, #7, MUL VL]\n" + "addvl x20, x20, #12\n" + "zip2 z16.s, z30.s, z16.s\n" + "st1w { z25.s }, p2, [x20, #-4, MUL VL]\n" + "st1w { z24.s }, p2, [x20, #-3, MUL VL]\n" + "st1w { z23.s }, p2, [x20, #-2, MUL VL]\n" + "st1w { z22.s }, p2, [x20, #-1, MUL VL]\n" + "st1w { z4.s }, p2, [x19]\n" + "st1w { z3.s }, p2, [x19, #1, MUL VL]\n" + "st1w { z1.s }, p2, [x19, 
#2, MUL VL]\n" + "st1w { z31.s }, p2, [x19, #3, MUL VL]\n" + "st1w { z29.s }, p2, [x19, #4, MUL VL]\n" + "st1w { z28.s }, p2, [x19, #5, MUL VL]\n" + "st1w { z21.s }, p2, [x19, #6, MUL VL]\n" + "st1w { z20.s }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "st1w { z19.s }, p2, [x19, #-4, MUL VL]\n" + "st1w { z18.s }, p2, [x19, #-3, MUL VL]\n" + "st1w { z17.s }, p2, [x19, #-2, MUL VL]\n" + "st1w { z16.s }, p2, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x26\n" + "whilelt p0.s, XZR, x20\n" + "ld1w { z18.s }, p0/Z, [x27]\n" + "ld1w { z16.s }, p0/Z, [x25]\n" + "zip1 z28.s, z18.s, z16.s\n" + "ld1w { z17.s }, p0/Z, [x24]\n" + "decw x20\n" + "zip2 z27.s, z18.s, z16.s\n" + "ld1w { z16.s }, p0/Z, [x23]\n" + "whilelt p1.s, XZR, x20\n" + "zip1 z26.s, z17.s, z16.s\n" + "ld1w { z18.s }, p1/Z, [x27, #1, MUL VL]\n" + "decw x20\n" + "zip2 z25.s, z17.s, z16.s\n" + "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n" + "whilelt p0.s, XZR, x20\n" + "zip1 z24.s, z18.s, z16.s\n" + "ld1w { z17.s }, p0/Z, [x27, #2, MUL VL]\n" + "addvl x27, x27, #3\n" + "zip2 z23.s, z18.s, z16.s\n" + "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n" + "addvl x25, x25, #3\n" + "zip1 z22.s, z17.s, z16.s\n" + "ld1w { z18.s }, p1/Z, [x24, #1, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "zip2 z21.s, z17.s, z16.s\n" + "ld1w { z20.s }, p0/Z, [x24, #2, MUL VL]\n" + "addvl x24, x24, #3\n" + "ld1w { z17.s }, p1/Z, [x23, #1, MUL VL]\n" + "zip1 z19.s, z18.s, z17.s\n" + "ld1w { z16.s }, p0/Z, [x23, #2, MUL VL]\n" + "addvl x23, x23, #3\n" + "zip2 z18.s, z18.s, z17.s\n" + "st1w { z28.s }, p2, [x19]\n" + "decd x22, ALL, MUL #6\n" + "zip1 z17.s, z20.s, z16.s\n" + "st1w { z27.s }, p2, [x19, #1, MUL VL]\n" + "cmp x22, #0x0\n" + "zip2 z16.s, z20.s, z16.s\n" + "st1w { z24.s }, p2, [x19, #2, MUL VL]\n" + "st1w { z23.s }, p2, [x19, #3, MUL VL]\n" + "st1w { z22.s }, p2, [x19, #4, MUL VL]\n" + "st1w { z21.s }, p2, [x19, #5, MUL VL]\n" + "st1w { z26.s }, p2, [x19, #6, MUL VL]\n" + "st1w { z25.s }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #12\n" + "st1w { z19.s }, p2, [x19, #-4, MUL VL]\n" + "st1w { z18.s }, p2, [x19, #-3, MUL VL]\n" + "st1w { z17.s }, p2, [x19, #-2, MUL VL]\n" + "st1w { z16.s }, p2, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #12\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x27, %x[in]\n" + "mov x26, %x[out]\n" + "add x25, x27, %x[in_stride]\n" + "add %x[in], x25, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "csel x25, x25, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x20, %x[width]\n" + "cntw x19, ALL, MUL #6\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1w { z19.s }, p2/Z, [x27]\n" + "sub x20, x20, x19\n" + "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1w { z29.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x27, #3, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x27, #4, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x27, #5, MUL VL]\n" + "addvl x27, x27, #6\n" + "ld1w { z16.s }, p2/Z, [x25]\n" + "zip1 z25.s, z19.s, z16.s\n" + "ld1w { z17.s }, p2/Z, [x25, #1, MUL VL]\n" + "zip2 z24.s, z19.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [x25, #3, MUL VL]\n" + "zip1 z20.s, z18.s, z17.s\n" + "ld1w { z22.s }, p2/Z, [x25, #4, MUL VL]\n" + "zip2 z19.s, z18.s, z17.s\n" + 
"ld1w { z21.s }, p2/Z, [x25, #5, MUL VL]\n" + "addvl x25, x25, #6\n" + "zip1 z18.s, z29.s, z16.s\n" + "st1w { z25.s }, p2, [x26]\n" + "zip2 z17.s, z29.s, z16.s\n" + "st1w { z24.s }, p2, [x26, #1, MUL VL]\n" + "zip1 z16.s, z28.s, z23.s\n" + "st1w { z20.s }, p2, [x26, #2, MUL VL]\n" + "zip2 z20.s, z28.s, z23.s\n" + "st1w { z19.s }, p2, [x26, #3, MUL VL]\n" + "zip1 z19.s, z27.s, z22.s\n" + "st1w { z18.s }, p2, [x26, #4, MUL VL]\n" + "zip2 z18.s, z27.s, z22.s\n" + "st1w { z17.s }, p2, [x26, #5, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "zip1 z17.s, z26.s, z21.s\n" + "st1w { z16.s }, p2, [x26]\n" + "zip2 z16.s, z26.s, z21.s\n" + "st1w { z20.s }, p2, [x26, #1, MUL VL]\n" + "st1w { z19.s }, p2, [x26, #2, MUL VL]\n" + "st1w { z18.s }, p2, [x26, #3, MUL VL]\n" + "st1w { z17.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z16.s }, p2, [x26, #5, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decd x20, ALL, MUL #6\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z17.s }, p0/Z, [x27]\n" + "ld1w { z16.s }, p0/Z, [x25]\n" + "zip1 z22.s, z17.s, z16.s\n" + "decw x19\n" + "zip2 z21.s, z17.s, z16.s\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z17.s }, p0/Z, [x27, #1, MUL VL]\n" + "decw x19\n" + "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n" + "zip1 z20.s, z17.s, z16.s\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z19.s }, p0/Z, [x27, #2, MUL VL]\n" + "zip2 z18.s, z17.s, z16.s\n" + "addvl x27, x27, #3\n" + "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n" + "zip1 z17.s, z19.s, z16.s\n" + "st1w { z22.s }, p2, [x26]\n" + "addvl x25, x25, #3\n" + "zip2 z16.s, z19.s, z16.s\n" + "st1w { z21.s }, p2, [x26, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1w { z20.s }, p2, [x26, #2, MUL VL]\n" + "st1w { z18.s }, p2, [x26, #3, MUL VL]\n" + "st1w { z17.s }, p2, [x26, #4, MUL VL]\n" + "st1w { z16.s }, p2, [x26, #5, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #6\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<6, 2, true, VLType::SVE>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_6VL_4x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 4, + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp new file mode 100644 index 0000000000..56b7ed6eda --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 8 * height * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "cmp %x[height], #0x2\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[out]\n" + "add x23, x25, %x[in_stride]\n" + "add %x[in], x23, %x[in_stride]\n" + "sub %x[height], %x[height], #0x2\n" + "mov x22, %x[width]\n" + "cntw x21, ALL, MUL #16\n" + "cmp x22, x21\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z15.s }, p1/Z, [x25]\n" + "mov x20, x24\n" + "ld1w { z14.s }, p1/Z, [x25, #1, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "ld1w { z13.s }, p1/Z, [x25, #2, MUL VL]\n" + "mov x19, x24\n" + "ld1w { z12.s }, p1/Z, [x25, #3, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "ld1w { z11.s }, p1/Z, [x25, #4, MUL VL]\n" + "sub x22, x22, x21\n" + "ld1w { z10.s }, p1/Z, [x25, #5, MUL VL]\n" + "cmp x22, x21\n" + "ld1w { z9.s }, p1/Z, [x25, #6, MUL VL]\n" + "ld1w { z8.s }, p1/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #16\n" + "ld1w { z7.s }, p1/Z, [x23]\n" + "ld1w { z6.s }, p1/Z, [x25, #-8, MUL VL]\n" + "ld1w { z5.s }, p1/Z, [x25, #-7, MUL VL]\n" + "ld1w { z4.s }, p1/Z, [x25, #-6, MUL VL]\n" + "ld1w { z3.s }, p1/Z, [x25, #-5, MUL VL]\n" + "ld1w { z2.s }, p1/Z, [x25, #-4, MUL VL]\n" + "ld1w { z1.s }, p1/Z, [x25, #-3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x25, #-2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x25, #-1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x23, #1, MUL VL]\n" + "ld1w { z29.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z28.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #4, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x23, #5, MUL VL]\n" + "ld1w { z25.s }, p1/Z, [x23, #6, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x23, #7, MUL VL]\n" + "addvl x23, x23, #16\n" + "ld1w { z23.s }, p1/Z, [x23, #-8, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x23, #-7, MUL VL]\n" + "ld1w { z21.s }, p1/Z, [x23, #-6, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x23, #-5, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #-4, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #-3, MUL VL]\n" + "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x23, #-1, MUL VL]\n" + "st1w { z15.s }, p1, [x20]\n" + "st1w { z14.s }, p1, [x20, #1, MUL VL]\n" + "st1w { z13.s }, p1, [x20, #2, MUL VL]\n" + 
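+      // Annotation (not in the original commit): this transform performs no lane
+      // interleaving (it is registered below as Transform<8, 1, true, VLType::SVE>);
+      // each source row is loaded sixteen vectors at a time and rewritten as
+      // consecutive 8-vector column blocks through the x20/x19 destination cursors.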
"st1w { z12.s }, p1, [x20, #3, MUL VL]\n" + "st1w { z11.s }, p1, [x20, #4, MUL VL]\n" + "st1w { z10.s }, p1, [x20, #5, MUL VL]\n" + "st1w { z9.s }, p1, [x20, #6, MUL VL]\n" + "st1w { z8.s }, p1, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "st1w { z7.s }, p1, [x20, #-8, MUL VL]\n" + "st1w { z30.s }, p1, [x20, #-7, MUL VL]\n" + "st1w { z29.s }, p1, [x20, #-6, MUL VL]\n" + "st1w { z28.s }, p1, [x20, #-5, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #-4, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #-3, MUL VL]\n" + "st1w { z25.s }, p1, [x20, #-2, MUL VL]\n" + "st1w { z24.s }, p1, [x20, #-1, MUL VL]\n" + "st1w { z6.s }, p1, [x19]\n" + "st1w { z5.s }, p1, [x19, #1, MUL VL]\n" + "st1w { z4.s }, p1, [x19, #2, MUL VL]\n" + "st1w { z3.s }, p1, [x19, #3, MUL VL]\n" + "st1w { z2.s }, p1, [x19, #4, MUL VL]\n" + "st1w { z1.s }, p1, [x19, #5, MUL VL]\n" + "st1w { z0.s }, p1, [x19, #6, MUL VL]\n" + "st1w { z31.s }, p1, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1w { z23.s }, p1, [x19, #-8, MUL VL]\n" + "st1w { z22.s }, p1, [x19, #-7, MUL VL]\n" + "st1w { z21.s }, p1, [x19, #-6, MUL VL]\n" + "st1w { z20.s }, p1, [x19, #-5, MUL VL]\n" + "st1w { z19.s }, p1, [x19, #-4, MUL VL]\n" + "st1w { z18.s }, p1, [x19, #-3, MUL VL]\n" + "st1w { z17.s }, p1, [x19, #-2, MUL VL]\n" + "st1w { z16.s }, p1, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x24\n" + "whilelt p0.s, XZR, x20\n" + "ld1w { z31.s }, p0/Z, [x25]\n" + "ld1w { z30.s }, p0/Z, [x23]\n" + "decw x20\n" + "add x24, x24, %x[out_stride]\n" + "whilelt p0.s, XZR, x20\n" + "ld1w { z29.s }, p0/Z, [x25, #1, MUL VL]\n" + "ld1w { z28.s }, p0/Z, [x23, #1, MUL VL]\n" + "decw x20\n" + "decw x22, ALL, MUL #8\n" + "whilelt p0.s, XZR, x20\n" + "ld1w { z27.s }, p0/Z, [x25, #2, MUL VL]\n" + "ld1w { z26.s }, p0/Z, [x23, #2, MUL VL]\n" + "decw x20\n" + "whilelt p0.s, XZR, x20\n" + "ld1w { z25.s }, p0/Z, [x25, #3, MUL VL]\n" + "decw x20\n" + "ld1w { z24.s }, p0/Z, [x23, #3, MUL VL]\n" + "whilelt p0.s, XZR, x20\n" + "decw x20\n" + "ld1w { z23.s }, p0/Z, [x25, #4, MUL VL]\n" + "ld1w { z22.s }, p0/Z, [x23, #4, MUL VL]\n" + "whilelt p0.s, XZR, x20\n" + "decw x20\n" + "ld1w { z21.s }, p0/Z, [x25, #5, MUL VL]\n" + "ld1w { z20.s }, p0/Z, [x23, #5, MUL VL]\n" + "whilelt p0.s, XZR, x20\n" + "decw x20\n" + "ld1w { z19.s }, p0/Z, [x25, #6, MUL VL]\n" + "ld1w { z18.s }, p0/Z, [x23, #6, MUL VL]\n" + "whilelt p0.s, XZR, x20\n" + "cmp x22, #0x0\n" + "ld1w { z17.s }, p0/Z, [x25, #7, MUL VL]\n" + "ld1w { z16.s }, p0/Z, [x23, #7, MUL VL]\n" + "addvl x25, x25, #8\n" + "st1w { z31.s }, p1, [x19]\n" + "addvl x23, x23, #8\n" + "st1w { z29.s }, p1, [x19, #1, MUL VL]\n" + "st1w { z27.s }, p1, [x19, #2, MUL VL]\n" + "st1w { z25.s }, p1, [x19, #3, MUL VL]\n" + "st1w { z23.s }, p1, [x19, #4, MUL VL]\n" + "st1w { z21.s }, p1, [x19, #5, MUL VL]\n" + "st1w { z19.s }, p1, [x19, #6, MUL VL]\n" + "st1w { z17.s }, p1, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1w { z30.s }, p1, [x19, #-8, MUL VL]\n" + "st1w { z28.s }, p1, [x19, #-7, MUL VL]\n" + "st1w { z26.s }, p1, [x19, #-6, MUL VL]\n" + "st1w { z24.s }, p1, [x19, #-5, MUL VL]\n" + "st1w { z22.s }, p1, [x19, #-4, MUL VL]\n" + "st1w { z20.s }, p1, [x19, #-3, MUL VL]\n" + "st1w { z18.s }, p1, [x19, #-2, MUL VL]\n" + "st1w { z16.s }, p1, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #16\n" + "cmp %x[height], #0x2\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main 
loop skip + + "7:" // Tail row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[out]\n" + "add %x[in], x25, %x[in_stride]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x20, %x[width]\n" + "cntw x19, ALL, MUL #16\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1w { z31.s }, p1/Z, [x25]\n" + "sub x20, x20, x19\n" + "ld1w { z30.s }, p1/Z, [x25, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1w { z29.s }, p1/Z, [x25, #2, MUL VL]\n" + "ld1w { z28.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x25, #4, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x25, #5, MUL VL]\n" + "ld1w { z25.s }, p1/Z, [x25, #6, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #16\n" + "ld1w { z23.s }, p1/Z, [x25, #-8, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x25, #-7, MUL VL]\n" + "ld1w { z21.s }, p1/Z, [x25, #-6, MUL VL]\n" + "ld1w { z20.s }, p1/Z, [x25, #-5, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x25, #-4, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x25, #-3, MUL VL]\n" + "ld1w { z17.s }, p1/Z, [x25, #-2, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x25, #-1, MUL VL]\n" + "st1w { z31.s }, p1, [x24]\n" + "st1w { z30.s }, p1, [x24, #1, MUL VL]\n" + "st1w { z29.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z28.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z27.s }, p1, [x24, #4, MUL VL]\n" + "st1w { z26.s }, p1, [x24, #5, MUL VL]\n" + "st1w { z25.s }, p1, [x24, #6, MUL VL]\n" + "st1w { z24.s }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "st1w { z23.s }, p1, [x24]\n" + "st1w { z22.s }, p1, [x24, #1, MUL VL]\n" + "st1w { z21.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z19.s }, p1, [x24, #4, MUL VL]\n" + "st1w { z18.s }, p1, [x24, #5, MUL VL]\n" + "st1w { z17.s }, p1, [x24, #6, MUL VL]\n" + "st1w { z16.s }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decw x20, ALL, MUL #8\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z23.s }, p0/Z, [x25]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z22.s }, p0/Z, [x25, #1, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z21.s }, p0/Z, [x25, #2, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z20.s }, p0/Z, [x25, #3, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z19.s }, p0/Z, [x25, #4, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z18.s }, p0/Z, [x25, #5, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z17.s }, p0/Z, [x25, #6, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z16.s }, p0/Z, [x25, #7, MUL VL]\n" + "st1w { z23.s }, p1, [x24]\n" + "addvl x25, x25, #8\n" + "st1w { z22.s }, p1, [x24, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1w { z21.s }, p1, [x24, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z19.s }, p1, [x24, #4, MUL VL]\n" + "st1w { z18.s }, p1, [x24, #5, MUL VL]\n" + "st1w { z17.s }, p1, [x24, #6, MUL VL]\n" + "st1w { z16.s }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #8\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", 
"z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<8, 1, true, VLType::SVE>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_8VL( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 4, + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp new file mode 100644 index 0000000000..f81098b26e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 8 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[out]\n" + "add x23, x25, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add %x[in], x21, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x21, x21, %x[pad_row], GT\n" + "csel x22, x22, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x23, x23, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #8\n" + "cmp x20, x19\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1b { z8.b }, p1/Z, [x25]\n" + "sub x20, x20, x19\n" + "ld1b { z24.b }, p1/Z, [x25, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1b { z27.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z25.b }, p1/Z, [x25, #3, MUL VL]\n" + "ld1b { z7.b }, p1/Z, [x25, #4, MUL VL]\n" + "ld1b { z3.b }, p1/Z, [x25, #5, MUL VL]\n" + "ld1b { z14.b }, p1/Z, [x25, #6, MUL VL]\n" + "ld1b { z13.b }, p1/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #8\n" + "ld1b { z16.b }, p1/Z, [x23]\n" + "ld1b { z12.b }, p1/Z, [x23, #1, MUL VL]\n" + "ld1b { z15.b }, p1/Z, [x23, #2, MUL VL]\n" + "ld1b { z11.b }, p1/Z, [x23, #3, MUL VL]\n" + "ld1b { z4.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p1/Z, [x23, #5, MUL VL]\n" + "ld1b { z26.b }, p1/Z, [x23, #6, MUL VL]\n" + "ld1b { z30.b }, p1/Z, [x23, #7, MUL VL]\n" + "addvl x23, x23, #8\n" + "ld1b { z22.b }, p1/Z, [x22]\n" + "zip1 z21.b, z8.b, z22.b\n" + "ld1b { z2.b }, p1/Z, [x22, #1, MUL VL]\n" + "zip2 z20.b, z8.b, z22.b\n" + "ld1b { z18.b }, p1/Z, [x22, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x22, #3, MUL VL]\n" + "zip1 z10.b, z24.b, z2.b\n" + "ld1b { z22.b }, p1/Z, [x22, #4, MUL VL]\n" + "zip2 z9.b, z24.b, z2.b\n" + "ld1b { z6.b }, p1/Z, [x22, #5, MUL VL]\n" + "zip1 z0.b, z27.b, z18.b\n" + "ld1b { z1.b }, p1/Z, [x22, #6, MUL VL]\n" + "zip2 z28.b, z27.b, z18.b\n" + "ld1b { z23.b }, p1/Z, [x22, #7, MUL VL]\n" + "addvl x22, x22, #8\n" + "zip1 z31.b, z25.b, z17.b\n" + "ld1b { z19.b }, p1/Z, [x21]\n" + "zip2 z8.b, z25.b, z17.b\n" + "ld1b { z2.b }, p1/Z, [x21, #1, MUL VL]\n" + "zip1 z27.b, z7.b, z22.b\n" + "ld1b { z29.b }, p1/Z, [x21, #2, MUL VL]\n" + "zip2 z7.b, z7.b, z22.b\n" + "ld1b { z24.b }, p1/Z, [x21, #3, MUL VL]\n" + "zip1 z18.b, z16.b, z19.b\n" + "ld1b { z25.b }, p1/Z, [x21, #4, MUL VL]\n" + "zip1 z17.b, z21.b, z18.b\n" + "ld1b { z22.b }, p1/Z, [x21, #5, MUL VL]\n" + "zip2 z18.b, z21.b, z18.b\n" + "ld1b { z21.b }, p1/Z, [x21, #6, MUL VL]\n" + "zip2 z16.b, z16.b, z19.b\n" + "ld1b { z19.b }, p1/Z, [x21, #7, MUL VL]\n" + "addvl x21, x21, #8\n" + "st1b { z17.b }, p1, [x24]\n" + "zip1 z17.b, z20.b, z16.b\n" + "zip2 z20.b, z20.b, z16.b\n" + "st1b { z18.b }, p1, [x24, #1, MUL VL]\n" + "zip1 z16.b, z12.b, z2.b\n" + "st1b { z17.b }, p1, [x24, #2, MUL VL]\n" + "zip1 z17.b, z10.b, z16.b\n" + "st1b { z20.b }, p1, [x24, #3, MUL VL]\n" + "zip2 z16.b, z10.b, z16.b\n" + "st1b { z17.b }, p1, [x24, #4, MUL VL]\n" + "zip2 z17.b, z12.b, z2.b\n" + "st1b { z16.b }, p1, [x24, #5, MUL VL]\n" + "zip1 z16.b, z9.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #6, MUL VL]\n" + "zip2 z16.b, z9.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #7, MUL 
VL]\n" + "add x24, x24, %x[out_stride]\n" + "zip1 z18.b, z15.b, z29.b\n" + "zip2 z17.b, z15.b, z29.b\n" + "zip1 z16.b, z0.b, z18.b\n" + "st1b { z16.b }, p1, [x24]\n" + "zip2 z16.b, z0.b, z18.b\n" + "st1b { z16.b }, p1, [x24, #1, MUL VL]\n" + "zip1 z16.b, z28.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #2, MUL VL]\n" + "zip2 z16.b, z28.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #3, MUL VL]\n" + "zip1 z17.b, z11.b, z24.b\n" + "zip1 z16.b, z31.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #4, MUL VL]\n" + "zip2 z16.b, z31.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #5, MUL VL]\n" + "zip2 z17.b, z11.b, z24.b\n" + "zip1 z16.b, z8.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #6, MUL VL]\n" + "zip2 z16.b, z8.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "zip1 z18.b, z4.b, z25.b\n" + "zip2 z17.b, z4.b, z25.b\n" + "zip1 z16.b, z27.b, z18.b\n" + "st1b { z16.b }, p1, [x24]\n" + "zip2 z16.b, z27.b, z18.b\n" + "st1b { z16.b }, p1, [x24, #1, MUL VL]\n" + "zip1 z16.b, z7.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #2, MUL VL]\n" + "zip2 z16.b, z7.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #3, MUL VL]\n" + "zip1 z18.b, z3.b, z6.b\n" + "zip1 z17.b, z5.b, z22.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #5, MUL VL]\n" + "zip2 z18.b, z3.b, z6.b\n" + "zip2 z17.b, z5.b, z22.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "zip1 z18.b, z14.b, z1.b\n" + "zip1 z17.b, z26.b, z21.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #1, MUL VL]\n" + "zip2 z18.b, z14.b, z1.b\n" + "zip2 z17.b, z26.b, z21.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #3, MUL VL]\n" + "zip1 z18.b, z13.b, z23.b\n" + "zip1 z17.b, z30.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #5, MUL VL]\n" + "zip2 z18.b, z13.b, z23.b\n" + "zip2 z17.b, z30.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x24, #7, MUL VL]\n" + "add x24, x24, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "mov x19, x20\n" + "decw x20, ALL, MUL #8\n" + "whilelt p0.b, XZR, x19\n" + "ld1b { z17.b }, p0/Z, [x25]\n" + "ld1b { z25.b }, p0/Z, [x23]\n" + "decb x19\n" + "ld1b { z16.b }, p0/Z, [x22]\n" + "zip1 z18.b, z17.b, z16.b\n" + "ld1b { z24.b }, p0/Z, [x21]\n" + "whilelt p0.b, XZR, x19\n" + "zip2 z23.b, z17.b, z16.b\n" + "ld1b { z22.b }, p0/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "zip1 z16.b, z25.b, z24.b\n" + "ld1b { z21.b }, p0/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "zip1 z17.b, z18.b, z16.b\n" + "ld1b { z20.b }, p0/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "zip2 z18.b, z18.b, z16.b\n" + "ld1b { z19.b }, p0/Z, [x21, #1, MUL VL]\n" + "addvl x21, x21, #2\n" + "zip2 z16.b, z25.b, z24.b\n" + "st1b { z17.b }, p1, [x24]\n" + "cmp x20, #0x0\n" + "zip1 z17.b, z23.b, z16.b\n" + "st1b { z18.b }, p1, [x24, #1, MUL VL]\n" + "zip2 z16.b, z23.b, z16.b\n" + "st1b { z17.b }, p1, [x24, #2, MUL VL]\n" + "zip1 z18.b, z22.b, z20.b\n" + "st1b { z16.b }, p1, 
[x24, #3, MUL VL]\n"
+ "zip1 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip2 z18.b, z22.b, z20.b\n"
+ "zip2 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
new file mode 100644
index 0000000000..34d43f5052
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
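The new sve_transpose_interleave_8VL_1x8.hpp introduced here is the 1x8 flavour of the same transform: eight consecutive K values (rows) are packed behind each output column, the operand depth the 8-bit MMLA kernels consume. As a rough scalar model of the whole 1xB family (B = 4 for the previous file, 8 for this one), ignoring the vector-length-sized column tiling the real kernels apply; names and signature are illustrative, not from the library:

#include <cstddef>
#include <cstdint>

// One slab of up to B rows: each column x contributes B K values
// back-to-back, zero-padded when fewer than B rows remain.
void interleave_1xB_slab(uint8_t *out, const uint8_t *in, size_t width,
                         size_t in_stride, size_t rows, size_t B)
{
    for (size_t x = 0; x < width; x++) {
        for (size_t k = 0; k < B; k++) {
            *out++ = (k < rows) ? in[k * in_stride + x] : 0;
        }
    }
}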
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint8_t))); + + if (height % 8) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 8 * roundup(height, 8) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "1:" // Main row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[out]\n" + "add x27, x9, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add %x[in], x21, %x[in_stride]\n" + "cmp %x[height], #0x7\n" + "csel x21, x21, %x[pad_row], GT\n" + "csel x22, x22, %x[pad_row], GE\n" + "cmp %x[height], #0x5\n" + "csel x23, x23, %x[pad_row], GT\n" + "csel x24, x24, %x[pad_row], GE\n" + "cmp %x[height], #0x3\n" + "csel x25, x25, %x[pad_row], GT\n" + "csel x26, x26, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x27, x27, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x8\n" + "mov x20, %x[width]\n" + "cntb x19, ALL, MUL #2\n" + "cmp x20, x19\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1b { z17.b }, p1/Z, [x9]\n" + "sub x20, x20, x19\n" + "ld1b { z5.b }, p1/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "ld1b { z19.b }, p1/Z, [x27]\n" + "cmp x20, x19\n" + "ld1b { z4.b }, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "ld1b { z18.b }, p1/Z, [x26]\n" + "ld1b { z3.b }, p1/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "ld1b { z2.b }, p1/Z, [x25]\n" + "ld1b { z1.b }, p1/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "ld1b { z16.b }, p1/Z, [x24]\n" + "zip1 z0.b, z17.b, z16.b\n" + "ld1b { z31.b }, p1/Z, [x24, #1, MUL VL]\n" + "addvl x24, x24, #2\n" + "zip2 z30.b, z17.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x23]\n" + "ld1b { z29.b }, p1/Z, [x23, #1, MUL VL]\n" + "zip1 z28.b, z5.b, z31.b\n" + "ld1b { z16.b }, p1/Z, [x22]\n" + "addvl x23, x23, #2\n" + "zip1 z27.b, z19.b, z17.b\n" + "ld1b { z26.b }, p1/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "zip2 z25.b, z19.b, z17.b\n" + "ld1b { z24.b }, p1/Z, [x21]\n" + "zip1 z22.b, z4.b, z29.b\n" + "ld1b { z23.b }, p1/Z, [x21, #1, MUL VL]\n" + "addvl x21, x21, #2\n" + "zip1 z21.b, z18.b, z16.b\n" + "zip2 z20.b, z18.b, z16.b\n" + "zip1 z18.b, z0.b, z21.b\n" + "zip1 z19.b, z2.b, z24.b\n" + "zip1 z17.b, z27.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z18.b, z0.b, z21.b\n" + "zip2 z17.b, z27.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip1 z18.b, z30.b, z20.b\n" + "zip2 z19.b, z2.b, z24.b\n" + "zip1 z17.b, z25.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "zip2 z18.b, z30.b, z20.b\n" + "zip2 z17.b, z25.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #7, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "zip1 z20.b, z3.b, z26.b\n" + "zip1 z19.b, z1.b, z23.b\n" + "zip1 z18.b, z28.b, z20.b\n" + "zip1 z17.b, z22.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, 
[x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z18.b, z28.b, z20.b\n" + "zip2 z17.b, z22.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip2 z22.b, z5.b, z31.b\n" + "zip2 z21.b, z3.b, z26.b\n" + "zip1 z18.b, z22.b, z21.b\n" + "zip2 z20.b, z4.b, z29.b\n" + "zip2 z19.b, z1.b, z23.b\n" + "zip1 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "zip2 z18.b, z22.b, z21.b\n" + "zip2 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #7, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x20, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.b, XZR, x20\n" + "ld1b { z18.b }, p0/Z, [x9]\n" + "addvl x9, x9, #1\n" + "ld1b { z28.b }, p0/Z, [x27]\n" + "addvl x27, x27, #1\n" + "ld1b { z17.b }, p0/Z, [x26]\n" + "addvl x26, x26, #1\n" + "ld1b { z27.b }, p0/Z, [x25]\n" + "addvl x25, x25, #1\n" + "ld1b { z16.b }, p0/Z, [x24]\n" + "zip1 z26.b, z18.b, z16.b\n" + "ld1b { z25.b }, p0/Z, [x23]\n" + "addvl x24, x24, #1\n" + "zip2 z24.b, z18.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x22]\n" + "addvl x23, x23, #1\n" + "zip1 z23.b, z28.b, z25.b\n" + "ld1b { z22.b }, p0/Z, [x21]\n" + "addvl x22, x22, #1\n" + "zip1 z20.b, z17.b, z16.b\n" + "addvl x21, x21, #1\n" + "zip2 z21.b, z17.b, z16.b\n" + "decd x20, ALL, MUL #8\n" + "zip1 z18.b, z26.b, z20.b\n" + "cmp x20, #0x0\n" + "zip1 z19.b, z27.b, z22.b\n" + "zip1 z17.b, z23.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #1, MUL VL]\n" + "zip2 z18.b, z26.b, z20.b\n" + "zip2 z17.b, z23.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #2, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #3, MUL VL]\n" + "zip1 z18.b, z24.b, z21.b\n" + "zip2 z20.b, z28.b, z25.b\n" + "zip2 z19.b, z27.b, z22.b\n" + "zip1 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #4, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #5, MUL VL]\n" + "zip2 z18.b, z24.b, z21.b\n" + "zip2 z17.b, z20.b, z19.b\n" + "zip1 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #6, MUL VL]\n" + "zip2 z16.b, z18.b, z17.b\n" + "st1b { z16.b }, p1, [x28, #7, MUL VL]\n" + "add x28, x28, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #8\n" + "cmp %x[height], #0x1\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<8, 8, true, VLType::SVE>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_8VL_1x8( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * 
sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 8, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..7124f7e909
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
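The 2x2 transform introduced here works at 16-bit granularity and pairs two K values per column, the layout BFDOT-style bf16 kernels consume; its zip1/zip2 chains are the vector form of this pairing. A scalar model of the core operation (function and parameter names are illustrative only):

#include <cstddef>
#include <cstdint>

// Merge two rows of 16-bit data so each column's two K values sit adjacently.
void interleave_2x2_pair(uint16_t *out, const uint16_t *row0,
                         const uint16_t *row1, size_t width)
{
    for (size_t x = 0; x < width; x++) {
        out[2 * x + 0] = row0[x];
        out[2 * x + 1] = row1[x];
    }
}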
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint16_t))); + + if (height % 2) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 8 * roundup(height, 2) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p3.b\n" + "cmp %x[height], #0x4\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x27, %x[in]\n" + "mov x26, %x[out]\n" + "add x25, x27, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add %x[in], x23, %x[in_stride]\n" + "sub %x[height], %x[height], #0x4\n" + "mov x22, %x[width]\n" + "cnth x21, ALL, MUL #8\n" + "cmp x22, x21\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1h { z8.h }, p3/Z, [x27]\n" + "mov x20, x26\n" + "ld1h { z3.h }, p3/Z, [x27, #1, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "ld1h { z22.h }, p3/Z, [x27, #2, MUL VL]\n" + "mov x19, x26\n" + "ld1h { z12.h }, p3/Z, [x27, #3, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "ld1h { z4.h }, p3/Z, [x27, #4, MUL VL]\n" + "sub x22, x22, x21\n" + "ld1h { z25.h }, p3/Z, [x27, #5, MUL VL]\n" + "cmp x22, x21\n" + "ld1h { z15.h }, p3/Z, [x27, #6, MUL VL]\n" + "ld1h { z2.h }, p3/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #8\n" + "ld1h { z16.h }, p3/Z, [x25]\n" + "zip1 z21.h, z8.h, z16.h\n" + "ld1h { z27.h }, p3/Z, [x25, #1, MUL VL]\n" + "zip2 z7.h, z8.h, z16.h\n" + "ld1h { z18.h }, p3/Z, [x25, #2, MUL VL]\n" + "ld1h { z30.h }, p3/Z, [x25, #3, MUL VL]\n" + "zip1 z19.h, z3.h, z27.h\n" + "ld1h { z0.h }, p3/Z, [x25, #4, MUL VL]\n" + "zip2 z16.h, z3.h, z27.h\n" + "ld1h { z27.h }, p3/Z, [x25, #5, MUL VL]\n" + "zip1 z13.h, z22.h, z18.h\n" + "ld1h { z26.h }, p3/Z, [x25, #6, MUL VL]\n" + "zip2 z29.h, z22.h, z18.h\n" + "ld1h { z24.h }, p3/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #8\n" + "zip1 z20.h, z12.h, z30.h\n" + "ld1h { z9.h }, p3/Z, [x24]\n" + "zip2 z14.h, z12.h, z30.h\n" + "ld1h { z12.h }, p3/Z, [x24, #1, MUL VL]\n" + "zip1 z5.h, z4.h, z0.h\n" + "ld1h { z31.h }, p3/Z, [x24, #2, MUL VL]\n" + "zip2 z1.h, z4.h, z0.h\n" + "ld1h { z22.h }, p3/Z, [x24, #3, MUL VL]\n" + "zip1 z10.h, z25.h, z27.h\n" + "ld1h { z3.h }, p3/Z, [x24, #4, MUL VL]\n" + "zip2 z6.h, z25.h, z27.h\n" + "ld1h { z4.h }, p3/Z, [x24, #5, MUL VL]\n" + "zip1 z8.h, z15.h, z26.h\n" + "ld1h { z25.h }, p3/Z, [x24, #6, MUL VL]\n" + "zip2 z11.h, z15.h, z26.h\n" + "ld1h { z30.h }, p3/Z, [x24, #7, MUL VL]\n" + "addvl x24, x24, #8\n" + "zip1 z17.h, z2.h, z24.h\n" + "ld1h { z23.h }, p3/Z, [x23]\n" + "zip2 z0.h, z2.h, z24.h\n" + "ld1h { z28.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z15.h }, p3/Z, [x23, #2, MUL VL]\n" + "zip1 z18.h, z9.h, z23.h\n" + "ld1h { z26.h }, p3/Z, [x23, #3, MUL VL]\n" + "zip2 z27.h, z9.h, z23.h\n" + "ld1h { z2.h }, p3/Z, [x23, #4, MUL VL]\n" + "zip1 z9.h, z12.h, z28.h\n" + "ld1h { z24.h }, p3/Z, [x23, #5, MUL VL]\n" + "zip2 z12.h, z12.h, z28.h\n" + "ld1h { z23.h }, p3/Z, [x23, #6, MUL VL]\n" + "zip1 z28.h, z31.h, z15.h\n" + "zip2 z31.h, z31.h, z15.h\n" + "ld1h { z15.h }, p3/Z, [x23, #7, MUL VL]\n" + "addvl x23, x23, #8\n" + "st1h { z21.h }, p3, [x20]\n" + "zip1 z21.h, z22.h, z26.h\n" + "zip2 z26.h, z22.h, z26.h\n" + "st1h { z7.h }, p3, [x20, #1, MUL VL]\n" + "zip1 z7.h, z3.h, z2.h\n" + "st1h { z19.h }, p3, [x20, #2, MUL VL]\n" + "zip2 z22.h, z3.h, z2.h\n" + "st1h { z16.h }, p3, [x20, #3, MUL VL]\n" + "zip1 z2.h, z4.h, z24.h\n" + "st1h { z13.h }, p3, [x20, #4, MUL 
VL]\n" + "zip2 z3.h, z4.h, z24.h\n" + "st1h { z29.h }, p3, [x20, #5, MUL VL]\n" + "zip1 z4.h, z25.h, z23.h\n" + "st1h { z20.h }, p3, [x20, #6, MUL VL]\n" + "zip2 z20.h, z25.h, z23.h\n" + "st1h { z14.h }, p3, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "zip1 z25.h, z30.h, z15.h\n" + "st1h { z18.h }, p3, [x20, #-8, MUL VL]\n" + "zip2 z18.h, z30.h, z15.h\n" + "st1h { z27.h }, p3, [x20, #-7, MUL VL]\n" + "st1h { z9.h }, p3, [x20, #-6, MUL VL]\n" + "st1h { z12.h }, p3, [x20, #-5, MUL VL]\n" + "st1h { z28.h }, p3, [x20, #-4, MUL VL]\n" + "st1h { z31.h }, p3, [x20, #-3, MUL VL]\n" + "st1h { z21.h }, p3, [x20, #-2, MUL VL]\n" + "st1h { z26.h }, p3, [x20, #-1, MUL VL]\n" + "st1h { z5.h }, p3, [x19]\n" + "st1h { z1.h }, p3, [x19, #1, MUL VL]\n" + "st1h { z10.h }, p3, [x19, #2, MUL VL]\n" + "st1h { z6.h }, p3, [x19, #3, MUL VL]\n" + "st1h { z8.h }, p3, [x19, #4, MUL VL]\n" + "st1h { z11.h }, p3, [x19, #5, MUL VL]\n" + "st1h { z17.h }, p3, [x19, #6, MUL VL]\n" + "st1h { z0.h }, p3, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1h { z7.h }, p3, [x19, #-8, MUL VL]\n" + "st1h { z22.h }, p3, [x19, #-7, MUL VL]\n" + "st1h { z2.h }, p3, [x19, #-6, MUL VL]\n" + "st1h { z3.h }, p3, [x19, #-5, MUL VL]\n" + "st1h { z4.h }, p3, [x19, #-4, MUL VL]\n" + "st1h { z20.h }, p3, [x19, #-3, MUL VL]\n" + "st1h { z25.h }, p3, [x19, #-2, MUL VL]\n" + "st1h { z18.h }, p3, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x26\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z17.h }, p0/Z, [x27]\n" + "ld1h { z16.h }, p0/Z, [x25]\n" + "zip1 z0.h, z17.h, z16.h\n" + "ld1h { z18.h }, p0/Z, [x24]\n" + "dech x20\n" + "zip2 z31.h, z17.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x23]\n" + "whilelt p2.h, XZR, x20\n" + "zip1 z30.h, z18.h, z16.h\n" + "ld1h { z17.h }, p2/Z, [x27, #1, MUL VL]\n" + "dech x20\n" + "zip2 z29.h, z18.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n" + "whilelt p1.h, XZR, x20\n" + "zip1 z28.h, z17.h, z16.h\n" + "ld1h { z18.h }, p1/Z, [x27, #2, MUL VL]\n" + "dech x20\n" + "zip2 z27.h, z17.h, z16.h\n" + "ld1h { z16.h }, p1/Z, [x25, #2, MUL VL]\n" + "whilelt p0.h, XZR, x20\n" + "zip1 z26.h, z18.h, z16.h\n" + "ld1h { z17.h }, p0/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "zip2 z25.h, z18.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "zip1 z24.h, z17.h, z16.h\n" + "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "zip2 z23.h, z17.h, z16.h\n" + "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" + "decw x22, ALL, MUL #8\n" + "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n" + "addvl x24, x24, #4\n" + "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n" + "zip1 z21.h, z19.h, z16.h\n" + "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n" + "cmp x22, #0x0\n" + "zip2 z20.h, z19.h, z16.h\n" + "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "zip1 z19.h, z18.h, z17.h\n" + "st1h { z0.h }, p3, [x19]\n" + "zip2 z18.h, z18.h, z17.h\n" + "st1h { z31.h }, p3, [x19, #1, MUL VL]\n" + "zip1 z17.h, z22.h, z16.h\n" + "st1h { z28.h }, p3, [x19, #2, MUL VL]\n" + "zip2 z16.h, z22.h, z16.h\n" + "st1h { z27.h }, p3, [x19, #3, MUL VL]\n" + "st1h { z26.h }, p3, [x19, #4, MUL VL]\n" + "st1h { z25.h }, p3, [x19, #5, MUL VL]\n" + "st1h { z24.h }, p3, [x19, #6, MUL VL]\n" + "st1h { z23.h }, p3, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "st1h { z30.h }, p3, [x19, #-8, MUL VL]\n" + "st1h { z29.h }, p3, [x19, #-7, MUL VL]\n" + "st1h { z21.h }, p3, 
[x19, #-6, MUL VL]\n" + "st1h { z20.h }, p3, [x19, #-5, MUL VL]\n" + "st1h { z19.h }, p3, [x19, #-4, MUL VL]\n" + "st1h { z18.h }, p3, [x19, #-3, MUL VL]\n" + "st1h { z17.h }, p3, [x19, #-2, MUL VL]\n" + "st1h { z16.h }, p3, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #16\n" + "cmp %x[height], #0x4\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x27, %x[in]\n" + "mov x26, %x[out]\n" + "add x25, x27, %x[in_stride]\n" + "add %x[in], x25, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "csel x25, x25, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x20, %x[width]\n" + "cnth x19, ALL, MUL #8\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1h { z17.h }, p3/Z, [x27]\n" + "sub x20, x20, x19\n" + "ld1h { z20.h }, p3/Z, [x27, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1h { z19.h }, p3/Z, [x27, #2, MUL VL]\n" + "ld1h { z1.h }, p3/Z, [x27, #3, MUL VL]\n" + "ld1h { z0.h }, p3/Z, [x27, #4, MUL VL]\n" + "ld1h { z31.h }, p3/Z, [x27, #5, MUL VL]\n" + "ld1h { z30.h }, p3/Z, [x27, #6, MUL VL]\n" + "ld1h { z29.h }, p3/Z, [x27, #7, MUL VL]\n" + "addvl x27, x27, #8\n" + "ld1h { z16.h }, p3/Z, [x25]\n" + "zip1 z28.h, z17.h, z16.h\n" + "ld1h { z18.h }, p3/Z, [x25, #1, MUL VL]\n" + "zip2 z27.h, z17.h, z16.h\n" + "ld1h { z17.h }, p3/Z, [x25, #2, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x25, #3, MUL VL]\n" + "zip1 z26.h, z20.h, z18.h\n" + "ld1h { z22.h }, p3/Z, [x25, #4, MUL VL]\n" + "zip2 z21.h, z20.h, z18.h\n" + "ld1h { z25.h }, p3/Z, [x25, #5, MUL VL]\n" + "zip1 z20.h, z19.h, z17.h\n" + "ld1h { z24.h }, p3/Z, [x25, #6, MUL VL]\n" + "zip2 z19.h, z19.h, z17.h\n" + "ld1h { z23.h }, p3/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #8\n" + "zip1 z18.h, z1.h, z16.h\n" + "st1h { z28.h }, p3, [x26]\n" + "zip2 z17.h, z1.h, z16.h\n" + "st1h { z27.h }, p3, [x26, #1, MUL VL]\n" + "zip1 z16.h, z0.h, z22.h\n" + "st1h { z26.h }, p3, [x26, #2, MUL VL]\n" + "zip2 z22.h, z0.h, z22.h\n" + "st1h { z21.h }, p3, [x26, #3, MUL VL]\n" + "zip1 z21.h, z31.h, z25.h\n" + "st1h { z20.h }, p3, [x26, #4, MUL VL]\n" + "zip2 z20.h, z31.h, z25.h\n" + "st1h { z19.h }, p3, [x26, #5, MUL VL]\n" + "zip1 z19.h, z30.h, z24.h\n" + "st1h { z18.h }, p3, [x26, #6, MUL VL]\n" + "zip2 z18.h, z30.h, z24.h\n" + "st1h { z17.h }, p3, [x26, #7, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "zip1 z17.h, z29.h, z23.h\n" + "st1h { z16.h }, p3, [x26]\n" + "zip2 z16.h, z29.h, z23.h\n" + "st1h { z22.h }, p3, [x26, #1, MUL VL]\n" + "st1h { z21.h }, p3, [x26, #2, MUL VL]\n" + "st1h { z20.h }, p3, [x26, #3, MUL VL]\n" + "st1h { z19.h }, p3, [x26, #4, MUL VL]\n" + "st1h { z18.h }, p3, [x26, #5, MUL VL]\n" + "st1h { z17.h }, p3, [x26, #6, MUL VL]\n" + "st1h { z16.h }, p3, [x26, #7, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decw x20, ALL, MUL #8\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x27]\n" + "ld1h { z16.h }, p0/Z, [x25]\n" + "zip1 z24.h, z17.h, z16.h\n" + "dech x19\n" + "zip2 z23.h, z17.h, z16.h\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z18.h }, p0/Z, [x27, #1, MUL VL]\n" + "dech x19\n" + "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n" + "zip1 z22.h, z18.h, z16.h\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x27, #2, MUL VL]\n" + "zip2 z21.h, z18.h, z16.h\n" + "dech x19\n" + "ld1h { z16.h }, p0/Z, [x25, #2, MUL VL]\n" + "zip1 z20.h, z17.h, 
z16.h\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z19.h }, p0/Z, [x27, #3, MUL VL]\n" + "zip2 z18.h, z17.h, z16.h\n" + "addvl x27, x27, #4\n" + "ld1h { z16.h }, p0/Z, [x25, #3, MUL VL]\n" + "zip1 z17.h, z19.h, z16.h\n" + "st1h { z24.h }, p3, [x26]\n" + "addvl x25, x25, #4\n" + "zip2 z16.h, z19.h, z16.h\n" + "st1h { z23.h }, p3, [x26, #1, MUL VL]\n" + "cmp x20, #0x0\n" + "st1h { z22.h }, p3, [x26, #2, MUL VL]\n" + "st1h { z21.h }, p3, [x26, #3, MUL VL]\n" + "st1h { z20.h }, p3, [x26, #4, MUL VL]\n" + "st1h { z18.h }, p3, [x26, #5, MUL VL]\n" + "st1h { z17.h }, p3, [x26, #6, MUL VL]\n" + "st1h { z16.h }, p3, [x26, #7, MUL VL]\n" + "add x26, x26, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #8\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<8, 2, true, VLType::SVE>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_8VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp new file mode 100644 index 0000000000..891e3abeb0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast(alloca(width * sizeof(uint16_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 8 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p2.b\n" + "cmp %x[height], #0x8\n" + "blt 6f\n" + "1:" // Main row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add %x[in], x23, %x[in_stride]\n" + "sub %x[height], %x[height], #0x8\n" + "mov x22, %x[width]\n" + "cnth x21, ALL, MUL #4\n" + "cmp x22, x21\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1h { z21.h }, p2/Z, [x11]\n" + "mov x20, x10\n" + "ld1h { z24.h }, p2/Z, [x11, #1, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "ld1h { z8.h }, p2/Z, [x11, #2, MUL VL]\n" + "mov x19, x10\n" + "ld1h { z11.h }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "ld1h { z3.h }, p2/Z, [x9]\n" + "add x10, x10, %x[out_stride]\n" + "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n" + "sub x22, x22, x21\n" + "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n" + "cmp x22, x21\n" + "ld1h { z12.h }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "ld1h { z16.h }, p2/Z, [x28]\n" + "zip1 z22.h, z21.h, z16.h\n" + "ld1h { z19.h }, p2/Z, [x28, #1, MUL VL]\n" + "zip2 z21.h, z21.h, z16.h\n" + "ld1h { z26.h }, p2/Z, [x28, #2, MUL VL]\n" + "ld1h { z13.h }, p2/Z, [x28, #3, MUL VL]\n" + "zip1 z14.h, z24.h, z19.h\n" + "ld1h { z16.h }, p2/Z, [x27]\n" + "addvl x28, x28, #4\n" + "zip2 z24.h, z24.h, z19.h\n" + "ld1h { z27.h }, p2/Z, [x27, #1, MUL VL]\n" + "zip1 z17.h, z8.h, z26.h\n" + "ld1h { z15.h }, p2/Z, [x27, #2, MUL VL]\n" + "zip2 z9.h, z8.h, z26.h\n" + "ld1h { z5.h }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "zip1 z2.h, z3.h, z16.h\n" + "ld1h { z4.h }, p2/Z, [x26]\n" + "zip2 z16.h, z3.h, z16.h\n" + "ld1h { z23.h }, p2/Z, [x26, #1, MUL VL]\n" + "zip1 z20.h, z22.h, z2.h\n" + "ld1h { z10.h }, p2/Z, [x26, #2, MUL VL]\n" + "zip2 z28.h, z22.h, z2.h\n" + "ld1h { z8.h }, p2/Z, [x26, #3, MUL VL]\n" + "addvl x26, x26, #4\n" + "zip1 z26.h, z21.h, z16.h\n" + "ld1h { z25.h }, p2/Z, [x25]\n" + "zip2 z7.h, z21.h, z16.h\n" + "ld1h { z31.h }, p2/Z, [x25, #1, MUL VL]\n" + "zip1 z3.h, z0.h, z27.h\n" + "ld1h { z16.h }, p2/Z, [x25, #2, MUL VL]\n" + "zip1 z22.h, z14.h, z3.h\n" + "ld1h { z6.h }, p2/Z, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "zip2 z19.h, z14.h, z3.h\n" + "ld1h { z2.h }, p2/Z, [x24]\n" + "zip2 z14.h, z0.h, z27.h\n" + "ld1h { z21.h }, p2/Z, [x24, #1, MUL VL]\n" + "zip1 z29.h, z24.h, z14.h\n" + "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n" + "zip2 z27.h, z24.h, z14.h\n" + "ld1h { z1.h }, p2/Z, [x24, #3, MUL VL]\n" + "addvl x24, x24, #4\n" + "zip1 z30.h, z4.h, z2.h\n" + "ld1h { z3.h }, p2/Z, [x23]\n" + "zip2 z14.h, z4.h, z2.h\n" + "ld1h { z4.h }, p2/Z, [x23, #1, MUL VL]\n" + "zip1 z2.h, z23.h, z21.h\n" + "ld1h { z24.h }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z21.h, z23.h, z21.h\n" + "ld1h { z23.h }, p2/Z, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1h { z20.h }, p2, [x20]\n" + "zip1 z20.h, z25.h, z3.h\n" + "zip2 z3.h, z25.h, z3.h\n" + "st1h { z28.h }, p2, [x20, #1, MUL VL]\n" + "zip1 z28.h, z30.h, z20.h\n" + "st1h 
{ z26.h }, p2, [x20, #2, MUL VL]\n" + "zip2 z20.h, z30.h, z20.h\n" + "st1h { z7.h }, p2, [x20, #3, MUL VL]\n" + "zip1 z25.h, z14.h, z3.h\n" + "st1h { z22.h }, p2, [x20, #4, MUL VL]\n" + "zip2 z7.h, z14.h, z3.h\n" + "st1h { z19.h }, p2, [x20, #5, MUL VL]\n" + "zip1 z14.h, z31.h, z4.h\n" + "st1h { z29.h }, p2, [x20, #6, MUL VL]\n" + "zip1 z19.h, z2.h, z14.h\n" + "st1h { z27.h }, p2, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "zip2 z29.h, z2.h, z14.h\n" + "st1h { z28.h }, p2, [x20, #-8, MUL VL]\n" + "zip2 z27.h, z31.h, z4.h\n" + "st1h { z20.h }, p2, [x20, #-7, MUL VL]\n" + "zip1 z30.h, z21.h, z27.h\n" + "st1h { z25.h }, p2, [x20, #-6, MUL VL]\n" + "zip2 z20.h, z21.h, z27.h\n" + "st1h { z7.h }, p2, [x20, #-5, MUL VL]\n" + "zip1 z14.h, z18.h, z15.h\n" + "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n" + "zip1 z19.h, z17.h, z14.h\n" + "st1h { z29.h }, p2, [x20, #-3, MUL VL]\n" + "zip2 z7.h, z17.h, z14.h\n" + "st1h { z30.h }, p2, [x20, #-2, MUL VL]\n" + "zip2 z14.h, z18.h, z15.h\n" + "st1h { z20.h }, p2, [x20, #-1, MUL VL]\n" + "zip1 z17.h, z9.h, z14.h\n" + "st1h { z19.h }, p2, [x19]\n" + "zip2 z27.h, z9.h, z14.h\n" + "st1h { z7.h }, p2, [x19, #1, MUL VL]\n" + "zip1 z18.h, z11.h, z13.h\n" + "st1h { z17.h }, p2, [x19, #2, MUL VL]\n" + "zip1 z17.h, z12.h, z5.h\n" + "st1h { z27.h }, p2, [x19, #3, MUL VL]\n" + "zip1 z20.h, z18.h, z17.h\n" + "st1h { z20.h }, p2, [x19, #4, MUL VL]\n" + "zip2 z18.h, z18.h, z17.h\n" + "st1h { z18.h }, p2, [x19, #5, MUL VL]\n" + "zip2 z18.h, z11.h, z13.h\n" + "zip2 z17.h, z12.h, z5.h\n" + "zip1 z29.h, z18.h, z17.h\n" + "st1h { z29.h }, p2, [x19, #6, MUL VL]\n" + "zip2 z17.h, z18.h, z17.h\n" + "st1h { z17.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "zip1 z18.h, z10.h, z0.h\n" + "zip1 z17.h, z16.h, z24.h\n" + "zip1 z30.h, z18.h, z17.h\n" + "st1h { z30.h }, p2, [x19, #-8, MUL VL]\n" + "zip2 z30.h, z18.h, z17.h\n" + "st1h { z30.h }, p2, [x19, #-7, MUL VL]\n" + "zip2 z18.h, z10.h, z0.h\n" + "zip2 z17.h, z16.h, z24.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-6, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-5, MUL VL]\n" + "zip1 z18.h, z8.h, z1.h\n" + "zip1 z17.h, z6.h, z23.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n" + "zip2 z18.h, z8.h, z1.h\n" + "zip2 z17.h, z6.h, z23.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x20, x22\n" + "mov x19, x10\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z17.h }, p1/Z, [x11]\n" + "ld1h { z22.h }, p1/Z, [x9]\n" + "dech x20\n" + "ld1h { z16.h }, p1/Z, [x28]\n" + "zip1 z19.h, z17.h, z16.h\n" + "ld1h { z18.h }, p1/Z, [x27]\n" + "whilelt p0.h, XZR, x20\n" + "zip2 z17.h, z17.h, z16.h\n" + "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #2\n" + "zip1 z16.h, z22.h, z18.h\n" + "ld1h { z2.h }, p0/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "zip1 z1.h, z19.h, z16.h\n" + "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x28, x28, #2\n" + "zip2 z0.h, z19.h, z16.h\n" + "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "zip2 z16.h, z22.h, z18.h\n" + "ld1h { z31.h }, p1/Z, [x26]\n" + "add x10, x10, %x[out_stride]\n" + "zip1 z30.h, z17.h, z16.h\n" + "ld1h { z29.h }, p0/Z, [x26, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + 
"zip2 z28.h, z17.h, z16.h\n" + "ld1h { z27.h }, p1/Z, [x25]\n" + "decd x22, ALL, MUL #8\n" + "zip1 z17.h, z21.h, z20.h\n" + "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n" + "addvl x25, x25, #2\n" + "zip1 z16.h, z2.h, z19.h\n" + "ld1h { z25.h }, p1/Z, [x24]\n" + "cmp x22, #0x0\n" + "zip1 z18.h, z17.h, z16.h\n" + "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n" + "addvl x24, x24, #2\n" + "zip2 z23.h, z17.h, z16.h\n" + "ld1h { z22.h }, p1/Z, [x23]\n" + "zip2 z17.h, z21.h, z20.h\n" + "ld1h { z21.h }, p0/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "zip2 z16.h, z2.h, z19.h\n" + "st1h { z1.h }, p2, [x19]\n" + "zip1 z20.h, z31.h, z25.h\n" + "st1h { z0.h }, p2, [x19, #1, MUL VL]\n" + "zip1 z19.h, z17.h, z16.h\n" + "st1h { z30.h }, p2, [x19, #2, MUL VL]\n" + "zip2 z17.h, z17.h, z16.h\n" + "st1h { z28.h }, p2, [x19, #3, MUL VL]\n" + "zip1 z16.h, z27.h, z22.h\n" + "st1h { z18.h }, p2, [x19, #4, MUL VL]\n" + "zip1 z18.h, z20.h, z16.h\n" + "st1h { z23.h }, p2, [x19, #5, MUL VL]\n" + "zip2 z16.h, z20.h, z16.h\n" + "st1h { z19.h }, p2, [x19, #6, MUL VL]\n" + "zip2 z19.h, z31.h, z25.h\n" + "st1h { z17.h }, p2, [x19, #7, MUL VL]\n" + "addvl x19, x19, #16\n" + "zip2 z17.h, z27.h, z22.h\n" + "st1h { z18.h }, p2, [x19, #-8, MUL VL]\n" + "zip1 z18.h, z29.h, z24.h\n" + "st1h { z16.h }, p2, [x19, #-7, MUL VL]\n" + "zip1 z16.h, z19.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-6, MUL VL]\n" + "zip2 z16.h, z19.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-5, MUL VL]\n" + "zip1 z17.h, z26.h, z21.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n" + "zip2 z18.h, z29.h, z24.h\n" + "zip2 z17.h, z26.h, z21.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "addvl %x[out], %x[out], #16\n" + "cmp %x[height], #0x8\n" + "bge 1b\n" + "cbz %x[height], 12f\n" + "6:" // Main loop skip + + "7:" // Tail row loop: Head + "mov x11, %x[in]\n" + "mov x10, %x[out]\n" + "add x9, x11, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add %x[in], x27, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "csel x27, x27, %x[pad_row], GT\n" + "csel x28, x28, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x9, x9, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x20, %x[width]\n" + "cnth x19, ALL, MUL #4\n" + "cmp x20, x19\n" + "blt 9f\n" + "8:" // Tail row loop: Unroll column loop + "ld1h { z17.h }, p2/Z, [x11]\n" + "sub x20, x20, x19\n" + "ld1h { z20.h }, p2/Z, [x11, #1, MUL VL]\n" + "cmp x20, x19\n" + "ld1h { z19.h }, p2/Z, [x11, #2, MUL VL]\n" + "ld1h { z1.h }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "ld1h { z0.h }, p2/Z, [x9]\n" + "ld1h { z31.h }, p2/Z, [x9, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z29.h }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "ld1h { z16.h }, p2/Z, [x28]\n" + "zip1 z28.h, z17.h, z16.h\n" + "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n" + "zip2 z27.h, z17.h, z16.h\n" + "ld1h { z17.h }, p2/Z, [x28, #2, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x28, #3, MUL VL]\n" + "zip1 z25.h, z20.h, z18.h\n" + "ld1h { z16.h }, p2/Z, [x27]\n" + "addvl x28, x28, #4\n" + "zip2 z24.h, z20.h, z18.h\n" + "ld1h { z23.h }, p2/Z, [x27, #1, MUL VL]\n" + "zip1 z22.h, z19.h, z17.h\n" + "ld1h { z21.h }, p2/Z, [x27, #2, MUL VL]\n" + "zip2 z20.h, z19.h, z17.h\n" + "ld1h { z19.h }, p2/Z, [x27, 
#3, MUL VL]\n" + "addvl x27, x27, #4\n" + "zip1 z18.h, z0.h, z16.h\n" + "zip2 z17.h, z0.h, z16.h\n" + "zip1 z16.h, z28.h, z18.h\n" + "st1h { z16.h }, p2, [x10]\n" + "zip2 z16.h, z28.h, z18.h\n" + "st1h { z16.h }, p2, [x10, #1, MUL VL]\n" + "zip1 z16.h, z27.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #2, MUL VL]\n" + "zip2 z16.h, z27.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "zip1 z17.h, z31.h, z23.h\n" + "zip1 z16.h, z25.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z25.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "zip2 z17.h, z31.h, z23.h\n" + "zip1 z16.h, z24.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #6, MUL VL]\n" + "zip2 z16.h, z24.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #7, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "zip1 z18.h, z30.h, z21.h\n" + "zip2 z17.h, z30.h, z21.h\n" + "zip1 z16.h, z22.h, z18.h\n" + "st1h { z16.h }, p2, [x10]\n" + "zip2 z16.h, z22.h, z18.h\n" + "st1h { z16.h }, p2, [x10, #1, MUL VL]\n" + "zip1 z16.h, z20.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #2, MUL VL]\n" + "zip2 z16.h, z20.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "zip1 z18.h, z1.h, z26.h\n" + "zip1 z17.h, z29.h, z19.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "zip2 z18.h, z1.h, z26.h\n" + "zip2 z17.h, z29.h, z19.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #6, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #7, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bge 8b\n" + "9:" // Tail row loop: Unroll column loop skip + "cbz x20, 11f\n" + "10:" // Tail row loop: Column loop + "mov x19, x20\n" + "decd x20, ALL, MUL #8\n" + "whilelt p0.h, XZR, x19\n" + "ld1h { z17.h }, p0/Z, [x11]\n" + "ld1h { z25.h }, p0/Z, [x9]\n" + "dech x19\n" + "ld1h { z16.h }, p0/Z, [x28]\n" + "zip1 z18.h, z17.h, z16.h\n" + "ld1h { z24.h }, p0/Z, [x27]\n" + "whilelt p0.h, XZR, x19\n" + "zip2 z23.h, z17.h, z16.h\n" + "ld1h { z22.h }, p0/Z, [x11, #1, MUL VL]\n" + "addvl x11, x11, #2\n" + "zip1 z16.h, z25.h, z24.h\n" + "ld1h { z21.h }, p0/Z, [x9, #1, MUL VL]\n" + "addvl x9, x9, #2\n" + "zip1 z17.h, z18.h, z16.h\n" + "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n" + "addvl x28, x28, #2\n" + "zip2 z18.h, z18.h, z16.h\n" + "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "zip2 z16.h, z25.h, z24.h\n" + "st1h { z17.h }, p2, [x10]\n" + "cmp x20, #0x0\n" + "zip1 z17.h, z23.h, z16.h\n" + "st1h { z18.h }, p2, [x10, #1, MUL VL]\n" + "zip2 z16.h, z23.h, z16.h\n" + "st1h { z17.h }, p2, [x10, #2, MUL VL]\n" + "zip1 z18.h, z22.h, z20.h\n" + "st1h { z16.h }, p2, [x10, #3, MUL VL]\n" + "zip1 z17.h, z21.h, z19.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #4, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #5, MUL VL]\n" + "zip2 z18.h, z22.h, z20.h\n" + "zip2 z17.h, z21.h, z19.h\n" + "zip1 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #6, MUL VL]\n" + "zip2 z16.h, z18.h, z17.h\n" + "st1h { z16.h }, p2, [x10, #7, MUL VL]\n" + "add x10, x10, %x[out_stride]\n" + "bgt 10b\n" + "11:" // Tail row loop: Column loop skip + "addvl %x[out], %x[out], #8\n" + "cmp %x[height], #0x1\n" + "bge 7b\n" + "12:" // Done + + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", 
"x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace + +template<> +void Transform<8, 4, true, VLType::SVE>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_8VL_2x4( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..1313479dbc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +namespace { + +void sve_transpose_interleave_8VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 8 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p4.b\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "mov x22, %x[width]\n" + "cnth x19, ALL, MUL #4\n" + "add x21, x23, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "add %x[in], x21, %x[in_stride]\n" + "csel x21, x21, %x[pad_row], GT\n" + "csel x23, x23, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x24, x24, %x[pad_row], GT\n" + "cmp x22, x19\n" + "mov x20, %x[out]\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z19.s }, p4/Z, [x25]\n" + "ld1w { z18.s }, p4/Z, [x25, #1, MUL VL]\n" + "sub x22, x22, x19\n" + "cmp x22, x19\n" + "ld1w { z20.s }, p4/Z, [x25, #2, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x25, #3, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p4/Z, [x23, #1, MUL VL]\n" + "zip1 z22.s, z19.s, z23.s\n" + "zip2 z21.s, z19.s, z23.s\n" + "ld1w { z31.s }, p4/Z, [x23, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23, #3, MUL VL]\n" + "zip1 z9.s, z18.s, z17.s\n" + "zip2 z7.s, z18.s, z17.s\n" + "ld1w { z19.s }, p4/Z, [x25, #4, MUL VL]\n" + "ld1w { z18.s }, p4/Z, [x25, #5, MUL VL]\n" + "zip1 z6.s, z20.s, z31.s\n" + "zip2 z5.s, z20.s, z31.s\n" + "ld1w { z15.s }, p4/Z, [x25, #6, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25, #7, MUL VL]\n" + "zip1 z3.s, z24.s, z16.s\n" + "zip2 z2.s, z24.s, z16.s\n" + "ld1w { z16.s }, p4/Z, [x23, #4, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x23, #5, MUL VL]\n" + "zip1 z1.s, z19.s, z16.s\n" + "zip2 z0.s, z19.s, z16.s\n" + "ld1w { z16.s }, p4/Z, [x23, #6, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x23, #7, MUL VL]\n" + "zip1 z31.s, z18.s, z17.s\n" + "zip2 z30.s, z18.s, z17.s\n" + "ld1w { z18.s }, p4/Z, [x24]\n" + "ld1w { z17.s }, p4/Z, [x24, #1, MUL VL]\n" + "zip1 z29.s, z15.s, z16.s\n" + "zip2 z28.s, z15.s, z16.s\n" + "ld1w { z16.s }, p4/Z, [x24, #2, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x24, #3, MUL VL]\n" + "zip1 z27.s, z20.s, z19.s\n" + "zip2 z26.s, z20.s, z19.s\n" + "ld1w { z11.s }, p4/Z, [x21]\n" + "ld1w { z8.s }, p4/Z, [x21, #1, MUL VL]\n" + ".inst 0x658ab2d8 // bfcvt z24.h, p4/M, z22.s\n" + "zip1 z25.s, z18.s, z11.s\n" + "ld1w { z4.s }, p4/Z, [x21, #2, MUL VL]\n" + "ld1w { z22.s }, p4/Z, [x21, #3, MUL VL]\n" + ".inst 0x658ab2af // bfcvt z15.h, p4/M, z21.s\n" + "zip2 z14.s, z18.s, z11.s\n" + "ld1w { z21.s }, p4/Z, [x24, #4, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x24, #5, MUL VL]\n" + ".inst 0x658ab12d // bfcvt z13.h, p4/M, z9.s\n" + "zip1 z12.s, z17.s, z8.s\n" + "ld1w { z11.s }, p4/Z, [x24, #6, MUL VL]\n" + "ld1w { z10.s }, p4/Z, [x24, #7, MUL VL]\n" + ".inst 0x658ab0e9 // bfcvt z9.h, p4/M, z7.s\n" + "zip2 z8.s, z17.s, z8.s\n" + "ld1w { z19.s }, p4/Z, [x21, #4, MUL VL]\n" + "ld1w { z18.s }, p4/Z, [x21, #5, MUL VL]\n" + ".inst 0x658ab0c7 // bfcvt z7.h, p4/M, z6.s\n" + "zip1 z6.s, z16.s, z4.s\n" + "ld1w { z17.s }, p4/Z, [x21, #6, MUL VL]\n" + ".inst 0x658ab0a5 // bfcvt z5.h, p4/M, z5.s\n" + "zip2 z4.s, z16.s, z4.s\n" + "ld1w { z16.s }, p4/Z, [x21, #7, MUL VL]\n" + ".inst 0x658ab063 // bfcvt z3.h, p4/M, z3.s\n" + ".inst 0x658ab042 // bfcvt z2.h, p4/M, z2.s\n" + "addvl x25, x25, #8\n" + 
"addvl x24, x24, #8\n" + ".inst 0x658ab021 // bfcvt z1.h, p4/M, z1.s\n" + ".inst 0x658ab000 // bfcvt z0.h, p4/M, z0.s\n" + "addvl x23, x23, #8\n" + "addvl x21, x21, #8\n" + ".inst 0x658ab3ff // bfcvt z31.h, p4/M, z31.s\n" + ".inst 0x658ab3de // bfcvt z30.h, p4/M, z30.s\n" + ".inst 0x658ab3bd // bfcvt z29.h, p4/M, z29.s\n" + ".inst 0x658ab39c // bfcvt z28.h, p4/M, z28.s\n" + ".inst 0x658ab37b // bfcvt z27.h, p4/M, z27.s\n" + ".inst 0x658ab35a // bfcvt z26.h, p4/M, z26.s\n" + ".inst 0x648ab338 // bfcvtnt z24.h, p4/M, z25.s\n" + "zip1 z25.s, z23.s, z22.s\n" + "st1h { z24.h }, p4, [x20]\n" + "zip2 z24.s, z23.s, z22.s\n" + "zip1 z23.s, z21.s, z19.s\n" + "zip2 z22.s, z21.s, z19.s\n" + "zip1 z21.s, z20.s, z18.s\n" + "zip2 z20.s, z20.s, z18.s\n" + "zip1 z19.s, z11.s, z17.s\n" + "zip2 z18.s, z11.s, z17.s\n" + "zip1 z17.s, z10.s, z16.s\n" + "zip2 z16.s, z10.s, z16.s\n" + ".inst 0x648ab1cf // bfcvtnt z15.h, p4/M, z14.s\n" + "st1h { z15.h }, p4, [x20, #1, MUL VL]\n" + ".inst 0x648ab18d // bfcvtnt z13.h, p4/M, z12.s\n" + ".inst 0x648ab109 // bfcvtnt z9.h, p4/M, z8.s\n" + "st1h { z13.h }, p4, [x20, #2, MUL VL]\n" + ".inst 0x648ab0c7 // bfcvtnt z7.h, p4/M, z6.s\n" + ".inst 0x648ab085 // bfcvtnt z5.h, p4/M, z4.s\n" + "st1h { z9.h }, p4, [x20, #3, MUL VL]\n" + ".inst 0x648ab323 // bfcvtnt z3.h, p4/M, z25.s\n" + ".inst 0x648ab302 // bfcvtnt z2.h, p4/M, z24.s\n" + "st1h { z7.h }, p4, [x20, #4, MUL VL]\n" + "st1h { z5.h }, p4, [x20, #5, MUL VL]\n" + ".inst 0x648ab2e1 // bfcvtnt z1.h, p4/M, z23.s\n" + ".inst 0x648ab2c0 // bfcvtnt z0.h, p4/M, z22.s\n" + "st1h { z3.h }, p4, [x20, #6, MUL VL]\n" + ".inst 0x648ab2bf // bfcvtnt z31.h, p4/M, z21.s\n" + ".inst 0x648ab29e // bfcvtnt z30.h, p4/M, z20.s\n" + "st1h { z2.h }, p4, [x20, #7, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + ".inst 0x648ab27d // bfcvtnt z29.h, p4/M, z19.s\n" + ".inst 0x648ab25c // bfcvtnt z28.h, p4/M, z18.s\n" + ".inst 0x648ab23b // bfcvtnt z27.h, p4/M, z17.s\n" + ".inst 0x648ab21a // bfcvtnt z26.h, p4/M, z16.s\n" + "st1h { z1.h }, p4, [x20]\n" + "st1h { z0.h }, p4, [x20, #1, MUL VL]\n" + "st1h { z31.h }, p4, [x20, #2, MUL VL]\n" + "st1h { z30.h }, p4, [x20, #3, MUL VL]\n" + "st1h { z29.h }, p4, [x20, #4, MUL VL]\n" + "st1h { z28.h }, p4, [x20, #5, MUL VL]\n" + "st1h { z27.h }, p4, [x20, #6, MUL VL]\n" + "st1h { z26.h }, p4, [x20, #7, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x22, 5f\n" + "4:" // Main row loop: Column loop + "mov x19, x22\n" + "whilelt p3.s, XZR, x19\n" + "ld1w { z22.s }, p3/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x23]\n" + "decw x19\n" + "whilelt p2.s, XZR, x19\n" + "ld1w { z20.s }, p2/Z, [x25, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x23, #1, MUL VL]\n" + "decw x19\n" + "whilelt p1.s, XZR, x19\n" + "ld1w { z18.s }, p1/Z, [x25, #2, MUL VL]\n" + "ld1w { z17.s }, p1/Z, [x23, #2, MUL VL]\n" + "decw x19\n" + "whilelt p0.s, XZR, x19\n" + "ld1w { z28.s }, p0/Z, [x25, #3, MUL VL]\n" + "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z27.s }, p3/Z, [x24]\n" + "ld1w { z3.s }, p2/Z, [x24, #1, MUL VL]\n" + "zip1 z26.s, z22.s, z21.s\n" + "zip2 z25.s, z22.s, z21.s\n" + "ld1w { z2.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z1.s }, p0/Z, [x24, #3, MUL VL]\n" + "zip1 z24.s, z20.s, z19.s\n" + "zip2 z23.s, z20.s, z19.s\n" + "ld1w { z22.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "zip1 z20.s, z18.s, z17.s\n" + "zip2 z19.s, z18.s, z17.s\n" + "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z0.s }, p0/Z, [x21, #3, MUL VL]\n" + "zip1 z17.s, 
z28.s, z16.s\n" + "zip2 z16.s, z28.s, z16.s\n" + "decd x22, ALL, MUL #8\n" + ".inst 0x658ab35f // bfcvt z31.h, p4/M, z26.s\n" + "zip1 z30.s, z27.s, z22.s\n" + "cmp x22, #0x0\n" + ".inst 0x658ab33d // bfcvt z29.h, p4/M, z25.s\n" + "zip2 z28.s, z27.s, z22.s\n" + "addvl x25, x25, #4\n" + "addvl x24, x24, #4\n" + ".inst 0x658ab31b // bfcvt z27.h, p4/M, z24.s\n" + "zip1 z26.s, z3.s, z21.s\n" + "addvl x23, x23, #4\n" + "addvl x21, x21, #4\n" + ".inst 0x658ab2f9 // bfcvt z25.h, p4/M, z23.s\n" + "zip2 z24.s, z3.s, z21.s\n" + ".inst 0x658ab297 // bfcvt z23.h, p4/M, z20.s\n" + "zip1 z22.s, z2.s, z18.s\n" + ".inst 0x658ab275 // bfcvt z21.h, p4/M, z19.s\n" + "zip2 z20.s, z2.s, z18.s\n" + ".inst 0x658ab233 // bfcvt z19.h, p4/M, z17.s\n" + "zip1 z18.s, z1.s, z0.s\n" + ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n" + "zip2 z16.s, z1.s, z0.s\n" + ".inst 0x648ab3df // bfcvtnt z31.h, p4/M, z30.s\n" + ".inst 0x648ab39d // bfcvtnt z29.h, p4/M, z28.s\n" + "st1h { z31.h }, p4, [x20]\n" + ".inst 0x648ab35b // bfcvtnt z27.h, p4/M, z26.s\n" + ".inst 0x648ab319 // bfcvtnt z25.h, p4/M, z24.s\n" + "st1h { z29.h }, p4, [x20, #1, MUL VL]\n" + ".inst 0x648ab2d7 // bfcvtnt z23.h, p4/M, z22.s\n" + ".inst 0x648ab295 // bfcvtnt z21.h, p4/M, z20.s\n" + "st1h { z27.h }, p4, [x20, #2, MUL VL]\n" + ".inst 0x648ab253 // bfcvtnt z19.h, p4/M, z18.s\n" + ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n" + "st1h { z25.h }, p4, [x20, #3, MUL VL]\n" + "st1h { z23.h }, p4, [x20, #4, MUL VL]\n" + "st1h { z21.h }, p4, [x20, #5, MUL VL]\n" + "st1h { z19.h }, p4, [x20, #6, MUL VL]\n" + "st1h { z17.h }, p4, [x20, #7, MUL VL]\n" + "add x20, x20, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #8\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +template<> +void Transform<8, 4, true, VLType::SVE>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_8VL_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp index a3216c494f..02367bd7e7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -58,7 +58,7 @@ struct TransposeInterleaveCommon {
         }
     }
 
-    static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+    static void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
         const auto ldin = stride;
 
         TOut *outarray = out;
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index e648ce2fb5..4ba03da6e7 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -36,6 +36,29 @@ namespace arm_gemm {
+template<typename T>
+std::string get_type_name() {
+#ifdef __GNUC__
+    std::string s = __PRETTY_FUNCTION__;
+
+    auto start = s.find("cls_");
+
+    if (start==std::string::npos) {
+        return "(unknown)";
+    }
+
+    for(size_t x = start+4; x<s.size(); x++) {
+        if (s[x] == ';' || s[x] == ']') {
+            return s.substr(start+4, x-(start+4));
+        }
+    }
+
+    return "(unknown)";
+#else
+    return "(unknown)";
+#endif
+}
+
 template<typename T>
 inline T iceildiv(const T a, const T b) {
     return (a + b - 1) / b;
diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
index 81e355d6b3..e38cc09202 100644
--- a/src/core/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm.hpp
@@ -44,9 +44,7 @@ enum class GemmMethod
     GEMM_INTERLEAVED_2D,
     QUANTIZE_WRAPPER,
     QUANTIZE_WRAPPER_2D,
-    GEMM_HYBRID_QUANTIZED,
-    INDIRECT_GEMM,
-    CONVOLUTION_GEMM
+    GEMM_HYBRID_QUANTIZED
 };
 
 struct KernelDescription
@@ -113,13 +111,15 @@ public:
     bool _indirect_input;
     Activation _act;
    int _maxthreads;
+    bool _fast_mode;
    const GemmConfig *_cfg;
 
    GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
             unsigned int K, unsigned int Ksections, unsigned int nbatches,
             unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
-             const GemmConfig *cfg = nullptr)
-        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+             bool fast_mode = false, const GemmConfig *cfg = nullptr)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode),
+          _cfg(cfg)
    {
    }
 };
diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
index 4af85ed663..378f1041be 100644
--- a/src/core/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/core/cpu/kernels/assembly/gemm_common.hpp
@@ -30,6 +30,9 @@
 namespace arm_gemm
 {
+// Avoid circular dependency with arm_gemm.hpp
+struct GemmConfig;
+
 // Abstract class for the GEMM/GEMV functions.
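A hypothetical use of the new get_type_name<T>() helper added to utils.hpp above: arm_gemm wraps each generated kernel in a class named cls_<kernel>, so the helper can recover a readable kernel name from __PRETTY_FUNCTION__ for debug output. The struct and call site below are illustrative, not from the patch, and assume utils.hpp is included:

#include <iostream>

struct cls_a64_hybrid_fp32_mla_6x16 { };

void print_kernel_name()
{
    // On GCC/Clang this prints "a64_hybrid_fp32_mla_6x16"; on other
    // compilers the helper falls back to its "(unknown)" placeholder.
    std::cout << arm_gemm::get_type_name<cls_a64_hybrid_fp32_mla_6x16>() << "\n";
}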
diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
index 81e355d6b3..e38cc09202 100644
--- a/src/core/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm.hpp
@@ -44,9 +44,7 @@ enum class GemmMethod
     GEMM_INTERLEAVED_2D,
     QUANTIZE_WRAPPER,
     QUANTIZE_WRAPPER_2D,
-    GEMM_HYBRID_QUANTIZED,
-    INDIRECT_GEMM,
-    CONVOLUTION_GEMM
+    GEMM_HYBRID_QUANTIZED
 };
 
 struct KernelDescription
@@ -113,13 +111,15 @@ public:
     bool              _indirect_input;
     Activation        _act;
     int               _maxthreads;
+    bool              _fast_mode;
     const GemmConfig *_cfg;
 
     GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
              unsigned int K, unsigned int Ksections, unsigned int nbatches,
              unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
-             const GemmConfig *cfg = nullptr)
-        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+             bool fast_mode = false, const GemmConfig *cfg = nullptr)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode),
+          _cfg(cfg)
     {
     }
 };
diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
index 4af85ed663..378f1041be 100644
--- a/src/core/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/core/cpu/kernels/assembly/gemm_common.hpp
@@ -30,6 +30,9 @@
 
 namespace arm_gemm
 {
+// Avoid circular dependency with arm_gemm.hpp
+struct GemmConfig;
+
 // Abstract class for the GEMM/GEMV functions.
 //
 // GEMM implementations may be "native" (never require any input
@@ -137,6 +140,10 @@ public:
     {
     }
 
+    /*** Introspection interface ***/
+    /* Get the configuration of this GEMM */
+    virtual GemmConfig get_config() = 0;
+
     // Destructor
     virtual ~IGemmCommon()
     {
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index bc9a3056e8..0647a473e2 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -101,6 +101,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
                                          fc_info.retain_internal_weights, // retain_internal_weights
                                          gemmlowp_output_stage,           // gemmlowp_output_stage
                                          fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                         false,                           // fast_math
                                          true,                            // broadcast_bias
                                          ActivationLayerInfo());          // activation_info
@@ -151,6 +152,7 @@ void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context
                             fc_info.retain_internal_weights, // retain_internal_weights
                             gemmlowp_output_stage,           // gemmlowp_output_stage
                             fc_info.fp_mixed_precision,      // fp_mixed_precision
+                            false,                           // fast_math
                             true,                            // broadcast_bias
                             fc_info.activation_info,         // activation_info
                             fc_info.constant_weights);       // constant_weights
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index f926b1d0a6..16735dde0e 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -128,6 +128,7 @@ void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_contex
                                  false,                 // retain_internal_weights
                                  gemmlowp_output_stage, // gemmlowp_output_stage
                                  false,                 // fp_mixed_precision
+                                 false,                 // fast_math
                                  true,                  // broadcast_bias
                                  act_info);             // activation_info
@@ -167,6 +168,7 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
                                          false,                 // retain_internal_weights
                                          gemmlowp_output_stage, // gemmlowp_output_stage
                                          false,                 // fp_mixed_precision
+                                         false,                 // fast_math
                                          true,                  // broadcast_bias
                                          act_info);             // activation_info
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 1022e397d0..e88bd3b5d4 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -67,7 +67,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
         case ConvolutionMethod::GEMM:
         {
             auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
             _function = std::move(f);
             break;
         }
diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp
index 9a4d171ce6..c6abe1f893 100644
--- a/src/runtime/cpu/operators/CpuGemm.cpp
+++ b/src/runtime/cpu/operators/CpuGemm.cpp
@@ -48,6 +48,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
     asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
     asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
     asm_info.activation_info         = info.activation_info();
+    asm_info.fast_mode               = info.fast_math();
 
     return asm_info;
 }
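With the plumbing above, the flag travels as `GEMMInfo::fast_math()` -> `AsmGemmInfo::fast_mode` -> `GemmArgs::_fast_mode`. A sketch of requesting the fast path directly at the arm_gemm level, using the constructor extended above (assumes `ci`, the problem dimensions, `activation` and `num_threads` are already in scope, as in `create_arm_gemm` below):

    // fast_mode = true allows an fp32 GEMM to compute internally in bf16
    // where a suitable kernel exists; the selection heuristics still make
    // the final choice, so this is a hint rather than a guarantee.
    arm_gemm::GemmArgs args(&ci, M, N, K, Ksections, nbatches, nmulti,
                            false /* indirect_input */, activation, num_threads,
                            true /* fast_mode */, nullptr /* cfg */);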
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
index a0424b1c63..fcdf8aa8f6 100644
--- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -66,7 +66,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
     // Create GEMMInfo structure
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, GEMMLowpOutputStageInfo(), false, false, act_info);
+                                         false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
 
     // Supported activations in GEMM
     const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -115,7 +115,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
         quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
-        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
+        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, false, act_info));
 
         auto mm_mem_req = _mm_gemmlowp->workspace();
         for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -146,7 +146,7 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
     // Create GEMMInfo structure
     const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                        false, GEMMLowpOutputStageInfo(), false, false, act_info);
+                                        false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
 
     if(is_quantized)
     {
@@ -186,7 +186,8 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
         std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
         input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
         weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
-        return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
+        return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, false,
+                                                       act_info));
     }
     else
     {
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
index c2e9f24ff6..10eece99eb 100644
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -86,6 +86,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect
     asm_info.padding_left    = info.conv_info.pad_left();
     asm_info.padding_value   = 0.f;
     asm_info.negated_offsets = false;
+    asm_info.fast_mode       = info.enable_fast_math;
     return asm_info;
 }
 } // namespace
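Each operator keeps its own copy of `init_assembly_metadata()`, and each copy now assigns `fast_mode` from its own descriptor (`GEMMInfo::fast_math()` in CpuGemm.cpp and below, `Conv2dInfo::enable_fast_math` just above). The shared shape, as a hypothetical helper (not part of this patch) for the GEMMInfo-based call sites:

    // Hypothetical consolidation of the duplicated GEMMInfo-based versions;
    // only fields that appear in the hunks above are set here.
    cpu::AsmGemmInfo make_asm_info(const GEMMInfo &info)
    {
        cpu::AsmGemmInfo asm_info;
        asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
        asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
        asm_info.activation_info         = info.activation_info();
        asm_info.fast_mode               = info.fast_math();
        return asm_info;
    }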
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 651ce436a0..56eb4fbb87 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -63,6 +63,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
     asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
     asm_info.activation_info     = info.activation_info();
     asm_info.output_stage        = info.gemmlowp_output_stage();
+    asm_info.fast_mode           = info.fast_math();
 
     return asm_info;
 }
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 79ea1cb5a7..bbbd5ac458 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -542,7 +542,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
     const CPUInfo &ci          = NEScheduler::get().cpu_info();
     unsigned int   num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
 
     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -556,11 +556,11 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
                            arm_gemm::Activation activation, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_UNUSED(activation);
-    Params         p           = extract_parameters(a, b, d, info);
-    const CPUInfo &ci          = NEScheduler::get().cpu_info();
-    unsigned int   num_threads = NEScheduler::get().num_threads();
+    Params              p           = extract_parameters(a, b, d, info);
+    const CPUInfo      &ci          = NEScheduler::get().cpu_info();
+    const unsigned int  num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
 
     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 355273adeb..88cfed002a 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -51,6 +51,7 @@ struct AsmGemmInfo
     int64_t           padding_top{ 0 };
     int64_t           padding_left{ 0 };
     float             padding_value{ 0.f };
+    bool              fast_mode{ false };
 };
 
 /** Assembly kernel glue */
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index c118da66ae..5f5fa3b653 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -98,7 +98,7 @@ protected:
                          (disable_c) ? nullptr : &c,
                          &dst,
                          alpha, beta,
-                         GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, (reinterpret_input_as_3d
+                         GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, false, (reinterpret_input_as_3d
                                   || reinterpret_output_as_3d)));
 
     ARM_COMPUTE_ASSERT(a.info()->is_resizable());
     ARM_COMPUTE_ASSERT(b.info()->is_resizable());
-- 
cgit v1.2.1