From c0b6f76561580414f08633a804fc548ccad65659 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 2 Nov 2020 01:37:17 +0000 Subject: COMPMID-3776: Indirect GEMM Signed-off-by: Georgios Pinitas Change-Id: I51a1b0f098bc3a8c408c50c92221e4df3061e12c Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4343 Tested-by: Arm Jenkins Reviewed-by: Sang-Hoon Park Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- Android.bp | 129 +- arm_compute/core/Types.h | 9 +- arm_compute/runtime/FunctionDescriptors.h | 24 + arm_compute/runtime/NEON/NEFunctions.h | 2 +- .../runtime/NEON/functions/NEConvolutionLayer.h | 9 +- .../NEON/functions/NEGEMMAssemblyDispatch.h | 58 +- arm_compute/runtime/NEON/functions/NEGEMMConv2d.h | 108 + .../NEGEMMLowpAssemblyMatrixMultiplyCore.h | 74 - docs/06_functions_list.dox | 3 +- src/core/NEON/NEKernels.h | 1 - src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h | 89 - src/core/NEON/kernels/arm_gemm/convolver.hpp | 182 + src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 81 +- src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 46 +- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 120 +- src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 52 +- .../NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 621 + .../kernels/arm_gemm/gemm_hybrid_quantized.hpp | 15 +- .../arm_gemm/gemm_hybrid_quantized_inline.hpp | 265 + .../NEON/kernels/arm_gemm/gemm_implementation.hpp | 28 +- src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 6 +- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 107 +- .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 896 +- .../arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 3 +- src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp | 143 +- src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp | 126 +- src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 6 +- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 106 +- .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 86 +- .../a32_interleave6_block1_fp32_fp32.hpp | 151 + 
.../a64_interleave4_block16_s8_s8.hpp | 193 + .../a64_interleave4_block16_s8_s8_summing.hpp | 225 + .../a64_interleave4_block16_u8_u8_summing.hpp | 225 + .../a64_interleave8_block1_bf16_fp32.hpp | 213 + .../a64_interleave8_block1_fp16_fp16.hpp | 270 + .../a64_interleave8_block1_fp16_fp32.hpp | 212 + .../a64_interleave8_block1_fp32_fp32.hpp | 196 + .../a64_interleave8_block1_s16_s16.hpp | 282 + .../a64_interleave8_block1_s16_s16_summing.hpp | 306 + .../a64_interleave8_block1_s8_s16.hpp | 286 + .../a64_interleave8_block1_s8_s16_summing.hpp | 322 + .../a64_interleave8_block1_u16_u16_summing.hpp | 306 + .../a64_interleave8_block1_u8_u16.hpp | 286 + .../a64_interleave8_block1_u8_u16_summing.hpp | 322 + .../a64_interleave8_block2_bf16_bf16.hpp | 247 + .../a64_interleave8_block2_fp32_fp32.hpp | 181 + .../a64_interleave8_block4_bf16_bf16.hpp | 223 + .../a64_interleave8_block4_s8_s8.hpp | 343 + .../a64_interleave8_block4_s8_s8_summing.hpp | 370 + .../a64_interleave8_block4_u8_u8_summing.hpp | 370 + .../a64_interleave8_block8_s8_s8.hpp | 319 + .../a64_interleave8_block8_s8_s8_summing.hpp | 362 + .../a64_interleave8_block8_u8_u8_summing.hpp | 362 + .../kernels/arm_gemm/indirect-interleaves/list.hpp | 48 + .../NEON/kernels/arm_gemm/interleave_indirect.cpp | 409 + .../NEON/kernels/arm_gemm/interleave_indirect.hpp | 43 + .../kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp | 73 - .../arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp | 323 - .../kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp | 74 + .../arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp | 323 + .../kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp | 77 - .../arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp | 389 - .../arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp | 350 - .../arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp | 347 - .../kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 5 +- .../kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp | 78 + .../arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp | 389 + 
.../arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp | 350 + .../arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp | 347 + .../kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp | 73 - .../arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp | 323 - .../kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp | 66 + .../arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp | 323 + .../kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp | 85 - .../arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp | 388 - .../arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp | 350 - .../arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp | 348 - .../kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 5 +- .../kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp | 86 + .../arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp | 388 + .../arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp | 350 + .../arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp | 348 + .../arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp | 82 + .../kernels/a64_gemv_fp32_mla_32/generic.cpp | 1546 ++ .../kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp | 80 - .../arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp | 398 - .../arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 353 - .../kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp | 350 - .../kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp | 80 + .../arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp | 398 + .../arm_gemm/kernels/a64_hgemm_8x24/generic.cpp | 353 + .../kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp | 350 + .../kernels/a64_hybrid_bf16fp32_dot_6x16.hpp | 86 + .../a64_hybrid_bf16fp32_dot_6x16/generic.cpp | 3668 ++++ .../arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp | 85 + .../kernels/a64_hybrid_fp16_mla_6x32/generic.cpp | 5400 ++++++ .../arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp | 111 - .../kernels/a64_hybrid_fp32_mla_16x4/a55.cpp | 2427 --- .../kernels/a64_hybrid_fp32_mla_16x4/generic.cpp | 1802 -- .../kernels/a64_hybrid_fp32_mla_16x4/x1.cpp | 1810 -- .../arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp | 89 - .../kernels/a64_hybrid_fp32_mla_4x8/generic.cpp | 1934 -- 
.../arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp | 102 + .../kernels/a64_hybrid_fp32_mla_6x16/generic.cpp | 3430 ++++ .../arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp | 85 + .../kernels/a64_hybrid_fp32_mla_8x4/generic.cpp | 2195 +++ .../arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp | 85 + .../kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp | 2072 ++ .../arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp | 85 + .../kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp | 3613 ++++ .../arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp | 92 - .../kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp | 2434 --- .../kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp | 1808 -- .../arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp | 85 + .../kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp | 3335 ++++ .../arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp | 85 + .../kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp | 2072 ++ .../arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp | 92 - .../kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp | 2434 --- .../kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp | 1808 -- .../arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp | 85 + .../kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp | 3335 ++++ .../kernels/a64_interleaved_bf16fp32_dot_12x8.hpp | 75 - .../a64_interleaved_bf16fp32_dot_12x8/generic.cpp | 327 - .../a64_interleaved_bf16fp32_dot_12x8/x1.cpp | 328 - .../kernels/a64_interleaved_bf16fp32_dot_8x12.hpp | 72 + .../a64_interleaved_bf16fp32_dot_8x12/generic.cpp | 327 + .../kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp | 72 - .../a64_interleaved_bf16fp32_mmla_12x8/generic.cpp | 418 - .../kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp | 72 + .../a64_interleaved_bf16fp32_mmla_8x12/generic.cpp | 428 + .../kernels/a64_interleaved_s8s32_mmla_12x8.hpp | 72 - .../a64_interleaved_s8s32_mmla_12x8/generic.cpp | 395 - .../kernels/a64_interleaved_s8s32_mmla_8x12.hpp | 73 + .../a64_interleaved_s8s32_mmla_8x12/generic.cpp | 395 + .../kernels/a64_interleaved_u8u32_mmla_12x8.hpp | 72 - 
.../a64_interleaved_u8u32_mmla_12x8/generic.cpp | 395 - .../kernels/a64_interleaved_u8u32_mmla_8x12.hpp | 73 + .../a64_interleaved_u8u32_mmla_8x12/generic.cpp | 395 + .../kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp | 116 - .../arm_gemm/kernels/a64_sgemm_12x8/a53.cpp | 377 - .../arm_gemm/kernels/a64_sgemm_12x8/a55.cpp | 378 - .../arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp | 386 - .../arm_gemm/kernels/a64_sgemm_12x8/generic.cpp | 356 - .../kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp | 354 - .../kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp | 116 + .../arm_gemm/kernels/a64_sgemm_8x12/a53.cpp | 377 + .../arm_gemm/kernels/a64_sgemm_8x12/a55.cpp | 378 + .../arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp | 386 + .../arm_gemm/kernels/a64_sgemm_8x12/generic.cpp | 356 + .../kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp | 354 + .../kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp | 86 - .../a64_smallK_hybrid_fp32_mla_4x6/generic.cpp | 4612 ----- .../kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp | 86 - .../a64_smallK_hybrid_fp32_mla_4x8/generic.cpp | 3340 ---- .../kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp | 88 + .../a64_smallK_hybrid_fp32_mla_6x4/generic.cpp | 4612 +++++ .../kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp | 88 + .../a64_smallK_hybrid_fp32_mla_8x4/generic.cpp | 3340 ++++ .../kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp | 91 - .../a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp | 4130 ---- .../a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp | 3786 ---- .../kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp | 91 - .../a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp | 3088 --- .../a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp | 2880 --- .../kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp | 91 + .../a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp | 4854 +++++ .../a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp | 4590 +++++ .../kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp | 91 + .../a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp | 3352 ++++ .../a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp | 3216 ++++ 
.../kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp | 91 - .../a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp | 4130 ---- .../a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp | 3786 ---- .../kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp | 91 - .../a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp | 3088 --- .../a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp | 2880 --- .../kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp | 91 + .../a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp | 4854 +++++ .../a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp | 4590 +++++ .../kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp | 91 + .../a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp | 3352 ++++ .../a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp | 3216 ++++ .../arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp | 82 + .../kernels/sve_gemv_fp32_mla_8VL/generic.cpp | 1372 ++ .../kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp | 89 - .../sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp | 2247 --- .../kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp | 86 + .../sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp | 2237 +++ .../kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp | 89 - .../sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp | 3459 ---- .../kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp | 89 - .../sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp | 1633 -- .../kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp | 89 - .../sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp | 2001 -- .../arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp | 89 - .../kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp | 3778 ---- .../arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp | 85 + .../kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp | 3178 ++++ .../arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 89 - .../kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 2118 --- .../arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp | 85 + .../kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp | 2236 +++ .../arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp | 85 + .../kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp | 1751 ++ .../kernels/sve_hybrid_fp32_mmla_4VLx4.hpp | 89 - 
.../kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp | 3459 ---- .../arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp | 85 + .../kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp | 1602 ++ .../arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp | 85 + .../kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp | 2770 +++ .../kernels/sve_hybrid_s8s32_dot_4VLx4.hpp | 89 - .../kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp | 2137 --- .../kernels/sve_hybrid_s8s32_dot_6x4VL.hpp | 85 + .../kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp | 1904 ++ .../arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp | 85 + .../kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp | 1602 ++ .../kernels/sve_hybrid_u8u32_dot_4VLx4.hpp | 89 - .../kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp | 2137 --- .../kernels/sve_hybrid_u8u32_dot_6x4VL.hpp | 85 + .../kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp | 1904 ++ .../kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp | 72 - .../sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp | 329 - .../kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp | 72 + .../sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp | 329 + .../sve_interleaved_bf16fp32_mmla_3VLx8.hpp | 72 - .../generic.cpp | 397 - .../sve_interleaved_bf16fp32_mmla_8x3VL.hpp | 72 + .../generic.cpp | 397 + .../kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 72 - .../sve_interleaved_fp16_mla_3VLx8/generic.cpp | 319 - .../kernels/sve_interleaved_fp16_mla_8x3VL.hpp | 72 + .../sve_interleaved_fp16_mla_8x3VL/generic.cpp | 319 + .../kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 72 - .../sve_interleaved_fp32_mla_3VLx8/generic.cpp | 328 - .../kernels/sve_interleaved_fp32_mla_8x3VL.hpp | 72 + .../sve_interleaved_fp32_mla_8x3VL/generic.cpp | 328 + .../kernels/sve_interleaved_fp32_mmla_3VLx8.hpp | 72 - .../sve_interleaved_fp32_mmla_3VLx8/generic.cpp | 397 - .../kernels/sve_interleaved_fp32_mmla_8x3VL.hpp | 72 + .../sve_interleaved_fp32_mmla_8x3VL/generic.cpp | 397 + .../kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 72 - 
.../sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 329 - .../kernels/sve_interleaved_s8s32_dot_8x3VL.hpp | 73 + .../sve_interleaved_s8s32_dot_8x3VL/generic.cpp | 329 + .../kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp | 72 - .../sve_interleaved_s8s32_mmla_3VLx8/generic.cpp | 397 - .../kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp | 73 + .../sve_interleaved_s8s32_mmla_8x3VL/generic.cpp | 397 + .../kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 72 - .../sve_interleaved_u8u32_dot_3VLx8/generic.cpp | 329 - .../kernels/sve_interleaved_u8u32_dot_8x3VL.hpp | 73 + .../sve_interleaved_u8u32_dot_8x3VL/generic.cpp | 329 + .../kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp | 72 - .../sve_interleaved_u8u32_mmla_3VLx8/generic.cpp | 397 - .../kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp | 73 + .../sve_interleaved_u8u32_mmla_8x3VL/generic.cpp | 397 + .../kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp | 88 - .../sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp | 18807 ------------------- .../kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp | 88 + .../sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp | 18807 +++++++++++++++++++ .../kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp | 88 - .../sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp | 7503 -------- .../kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp | 88 + .../sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp | 8971 +++++++++ .../kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp | 88 - .../sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp | 7503 -------- .../kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp | 88 + .../sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp | 8971 +++++++++ .../NEON/kernels/arm_gemm/quantize_wrapper.hpp | 2 +- src/core/NEON/kernels/arm_gemm/quantized.cpp | 173 + src/core/NEON/kernels/arm_gemm/quantized.hpp | 6 + .../NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp | 1160 ++ .../NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp | 1160 ++ .../NEON/kernels/arm_gemm/std_transforms_fixed.hpp | 20 +- .../NEON/kernels/arm_gemm/std_transforms_sve.hpp | 21 +- 
src/core/NEON/kernels/arm_gemm/transform.hpp | 8 +- .../transforms/a32_interleave_6way_32bit.hpp | 167 - .../a32_transpose_interleave_8way_32bit.hpp | 10 +- .../transforms/a64_block16_interleave4_8bit.hpp | 128 - .../transforms/a64_interleave_8way_16bit.hpp | 182 - .../transforms/a64_interleave_8way_32bit.hpp | 191 - .../transforms/a64_interleave_8way_block4_8bit.hpp | 228 - .../a64_interleave_8way_half_to_float.hpp | 207 - .../transforms/a64_interleave_8way_s8_to_s16.hpp | 224 - .../transforms/a64_interleave_8way_u8_to_u16.hpp | 224 - .../a64_transpose_interleave_12way_16bit.hpp | 8 +- ...64_transpose_interleave_12way_half_to_float.hpp | 2 +- .../a64_transpose_interleave_24way_16bit.hpp | 8 +- .../a64_transpose_interleave_8way_32bit.hpp | 8 +- src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 14 - .../transforms/sve_interleave_8way_32bit.hpp | 596 - .../sve_interleave_8way_block2_16bit.hpp | 596 - .../sve_interleave_8way_block2_32bit.hpp | 596 - .../sve_interleave_8way_block4_16bit.hpp | 596 - .../transforms/sve_interleave_8way_block4_8bit.hpp | 596 - .../transforms/sve_interleave_8way_block8_8bit.hpp | 596 - .../transforms/transpose_interleave_common.hpp | 2 + src/core/NEON/kernels/arm_gemm/utils.hpp | 93 +- .../NEON/kernels/assembly/INEGEMMWrapperKernel.cpp | 89 - .../NEON/kernels/assembly/INEGEMMWrapperKernel.h | 108 - src/core/NEON/kernels/assembly/arm_gemm.hpp | 18 +- .../kernels/assembly/convolution_parameters.hpp | 65 + src/core/NEON/kernels/assembly/gemm_common.hpp | 26 +- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 61 +- src/runtime/NEON/functions/NEGEMM.cpp | 26 +- .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 328 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 167 + .../NEGEMMLowpAssemblyMatrixMultiplyCore.cpp | 142 - .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 28 +- .../NEON/functions/NESimpleAssemblyFunction.cpp | 46 - .../NEON/functions/NESimpleAssemblyFunction.h | 56 - tests/validation/NEON/ConvolutionLayer.cpp | 97 +- 
tests/validation/NEON/GEMMLowp.cpp | 23 - .../validation/fixtures/ConvolutionLayerFixture.h | 16 +- 315 files changed, 155454 insertions(+), 129826 deletions(-) create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMConv2d.h delete mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h delete mode 100644 src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h create mode 100644 src/core/NEON/kernels/arm_gemm/convolver.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp create mode 100644 
src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp delete 
mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp create mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp create mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp create mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp delete mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp 
delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp delete mode 100644 src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp delete mode 100644 src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h create mode 100644 src/core/NEON/kernels/assembly/convolution_parameters.hpp create mode 100644 src/runtime/NEON/functions/NEGEMMConv2d.cpp delete mode 100644 src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp delete mode 100644 src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp delete mode 100644 src/runtime/NEON/functions/NESimpleAssemblyFunction.h diff --git a/Android.bp b/Android.bp index 8d931c23c8..98b00cf5ba 100644 --- a/Android.bp +++ b/Android.bp @@ -367,10 +367,12 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp", "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp", + "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults.cpp", 
"src/core/NEON/kernels/arm_gemm/misc.cpp", "src/core/NEON/kernels/arm_gemm/quantized.cpp", - "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp", + "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp", + "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp", "src/core/NEON/kernels/convolution/common/padding.cpp", "src/core/NEON/kernels/convolution/common/qasymm8.cpp", "src/core/NEON/kernels/convolution/common/qsymm8.cpp", @@ -669,9 +671,9 @@ cc_library_static { "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp", "src/runtime/NEON/functions/NEGEMM.cpp", "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp", + "src/runtime/NEON/functions/NEGEMMConv2d.cpp", "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp", "src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp", - "src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp", "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp", "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp", "src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp", @@ -727,7 +729,6 @@ cc_library_static { "src/runtime/NEON/functions/NEScale.cpp", "src/runtime/NEON/functions/NEScharr3x3.cpp", "src/runtime/NEON/functions/NESelect.cpp", - "src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp", "src/runtime/NEON/functions/NESlice.cpp", "src/runtime/NEON/functions/NESobel3x3.cpp", "src/runtime/NEON/functions/NESobel5x5.cpp", @@ -779,69 +780,71 @@ cc_library_static { }, arm64: { srcs: [ - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp", - 
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp", - 
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp", - 
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp", ], }, diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 306bdc6706..2e639c4be4 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -137,10 +137,11 
@@ enum class DataLayoutDimension /** Available ConvolutionMethod*/ enum class ConvolutionMethod { - GEMM, /**< Convolution using GEMM */ - DIRECT, /**< Direct convolution */ - WINOGRAD, /**< Convolution using Winograd */ - FFT /**< Convolution using FFT */ + GEMM, /**< Convolution using GEMM */ + GEMM_CONV2D, /**< Direct 2D GEMM convolution */ + DIRECT, /**< Direct convolution */ + WINOGRAD, /**< Convolution using Winograd */ + FFT /**< Convolution using FFT */ }; /** Available DepthwiseConvolutionFunction*/ diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h index 16d6c345e2..1f4216eb21 100644 --- a/arm_compute/runtime/FunctionDescriptors.h +++ b/arm_compute/runtime/FunctionDescriptors.h @@ -23,6 +23,9 @@ */ #ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H #define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H + +#include "arm_compute/core/Types.h" + #include namespace arm_compute @@ -48,5 +51,26 @@ struct FFT2DInfo unsigned int axis1{ 1 }; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/ FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. 
*/ }; + +/** Descriptor used by the Convolution function */ +struct Conv2dInfo +{ + Conv2dInfo() = default; + + Conv2dInfo(const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) + : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups) + { + } + + PadStrideInfo conv_info{}; + Size2D dilation{ 1U, 1U }; + ActivationLayerInfo act_info{}; + bool enable_fast_math{ false }; + unsigned int num_groups{ 1 }; +}; } // namespace arm_compute #endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */ diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index a97fa3b81a..e7d59e1608 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -78,9 +78,9 @@ #include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index 54dae57752..a061dc7b04 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -26,16 +26,15 @@ #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/ITensorInfo.h" #include 
"arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" + #include namespace arm_compute { +// Forward declarations class ITensor; /** Basic function to simulate a convolution layer. This function calls one of the following NEON functions: @@ -158,5 +157,5 @@ private: std::shared_ptr _memory_manager; std::unique_ptr _function; /**< Function to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */ \ No newline at end of file diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h index ac77acf69d..8f9498d0f5 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h @@ -32,6 +32,28 @@ namespace arm_compute { +/* Convolution method supported by the assembly gemm interface */ +enum class AsmConvMethod +{ + Im2Col, + Indirect, + Conv +}; + +struct AsmGemmInfo +{ + AsmConvMethod method{ AsmConvMethod::Im2Col }; + PadStrideInfo ps_info{}; + ActivationLayerInfo activation_info{}; + GEMMLowpOutputStageInfo output_stage{}; + bool negated_offsets{ true }; + bool reinterpret_input_as_3d{ false }; + bool depth_output_gemm3d{ false }; + int64_t padding_top{ 0 }; + int64_t padding_left{ 0 }; + float padding_value{ 0.f }; +}; + /** Assembly kernel glue */ class NEGEMMAssemblyDispatch : public IFunction { @@ -55,33 +77,28 @@ public: virtual ~IFallback() = default; }; -private: - /** Interface for the arm_gemm fallback */ - std::unique_ptr _arm_gemm; - MemoryGroup _memory_group; /**< Function memory group */ - IWeightsManager *_weights_manager; /**< Pointer to the weights manager */ public: 
/** If supported create a Compute Library function else fallback to the arm_gemm function. * - * @param[in] a Input tensor (Matrix A) - * @param[in] b Input tensor (Matrix B) - * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations - * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. - * @param[in] gemm_info GEMM meta-data + * @param[in] a Input tensor (Matrix A) + * @param[in] b Input tensor (Matrix B) + * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations + * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] info GEMM meta-data */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info); + void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info); /** Indicates whether or not this function can be used to process the given parameters. * - * @param[in] a Input tensor info (Matrix A) - * @param[in] b Input tensor info (Matrix B) - * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations - * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. - * @param[in] gemm_info GEMM meta-data + * @param[in] a Input tensor info (Matrix A) + * @param[in] b Input tensor info (Matrix B) + * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations + * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] info GEMM meta-data * * @return a status. 
*/ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check @@ -94,10 +111,15 @@ public: * @return True if the function is configured and ready to run */ bool is_configured() const; + // Inherited methods overridden: - /** Runs a preparation step, usually for pre-transposing matrix b */ void prepare() override; void run() override; + +private: + std::unique_ptr _arm_gemm; /** Interface for the arm_gemm fallback */ + MemoryGroup _memory_group; /**< Function memory group */ + IWeightsManager *_weights_manager; /**< Pointer to the weights manager */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h new file mode 100644 index 0000000000..7cae39397f --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEGEMMCONV2D_H +#define ARM_COMPUTE_NEGEMMCONV2D_H + +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/Tensor.h" + +#include +namespace arm_compute +{ +// Forward declarations +class ITensor; +/** Basic function to compute the convolution layer. 
This function calls the following NEON kernels/functions: + * + * Supports only NHWC data layout + * + * -# @ref NEGEMMAssemblyDispatch + * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch + * + * Weights are transformed from OHWI to HWIO format using the following kernels: + * -# @ref NEPermute + */ +class NEGEMMConv2d : public IFunction +{ +public: + /** Constructor */ + NEGEMMConv2d(const std::shared_ptr &memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMConv2d(const NEGEMMConv2d &) = delete; + /** Default move constructor */ + NEGEMMConv2d(NEGEMMConv2d &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMConv2d &operator=(const NEGEMMConv2d &) = delete; + /** Default move assignment operator */ + NEGEMMConv2d &operator=(NEGEMMConv2d &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. 
+ * @param[in] info Convolution layer descriptor + */ + void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d + * + * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + NEGEMMAssemblyDispatch _gemm_asm_func; + NEActivationLayer _activation_func; + NEPermute _weights_permute_func; + const ITensor *_original_weights; + Tensor _permuted_weights; + bool _is_prepared; + bool _run_activation; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h deleted file mode 100644 index 961b1901e7..0000000000 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H -#define ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H - -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "arm_compute/runtime/Tensor.h" - -#include - -namespace arm_compute -{ -// Forward declarations -class ITensor; -class NEGEMMInterleave4x4Kernel; -class NEGEMMTranspose1xWKernel; -class NEGEMMLowpMatrixMultiplyKernel; - -/** Basic function to execute matrix multiply assembly kernels. */ -class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr memory_manager = nullptr); - /** Destructor */ - ~NEGEMMLowpAssemblyMatrixMultiplyCore(); - - /** Initialise the kernel's inputs, output - * - * @param[in] a First input tensor (Matrix A). Data type supported: U8, S8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). Data type supported: same as @p a - * @param[out] output Output tensor. 
Data type supported: Data type supported: U32, S32 - */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - NEGEMMAssemblyDispatch _asm_glue; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - Tensor _tmp_a; - Tensor _tmp_b; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H */ diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox index ac944610dc..e6924211e2 100644 --- a/docs/06_functions_list.dox +++ b/docs/06_functions_list.dox @@ -141,8 +141,8 @@ namespace arm_compute - @ref NEGaussianPyramidOrb - @ref NEGEMM - @ref NEGEMMAssemblyDispatch + - @ref NEGEMMConv2d - @ref NEGEMMConvolutionLayer - - @ref NEGEMMLowpAssemblyMatrixMultiplyCore - @ref NEGEMMLowpMatrixMultiplyCore - @ref NEGenerateProposalsLayer - @ref NEHarrisCorners @@ -173,7 +173,6 @@ namespace arm_compute - @ref NERNNLayer - @ref NEROIPoolingLayer - @ref NEScale - - @ref NESimpleAssemblyFunction - @ref NESobel5x5 - @ref NESobel7x7 - @ref NESoftmaxLayerGeneric <IS_LOG> diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 67562933d4..79c4bcea25 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -72,7 +72,6 @@ #include "src/core/NEON/kernels/NEFlattenLayerKernel.h" #include "src/core/NEON/kernels/NEFloorKernel.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" #include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" diff --git a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h deleted file mode 100644 index 775a2c06ab..0000000000 --- 
a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H -#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H - -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Base class for GEMM NEON kernels implemented in Assembly. 
*/ -class NEGEMMAssemblyBaseKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEGEMMAssemblyBaseKernel"; - } - /** Constructor */ - NEGEMMAssemblyBaseKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false) - { - } - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default; - - virtual ~NEGEMMAssemblyBaseKernel() = default; - - /** Initialise the kernel's input and output. - * - * The computed function is C = a * AxB + b * C. - * - * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32 - * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0 - * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0. - * @param[out] workspace Space for intermediate results. - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the accumulation. - * @param[in] is_transposed_0 (Optional)True if @p input0 is transposed else false. (Defaults to false) - * @param[in] is_transposed_1 (Optional)True if @p input1 is transposed else false. 
(Defaults to false) - */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false) - { - internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1); - } - -protected: - virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0; - - const ITensor *_input0; - const ITensor *_input1; - ITensor *_output; - ITensor *_workspace; - float _alpha; - float _beta; - bool _is_transposed_0; - bool _is_transposed_1; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/ diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp new file mode 100644 index 0000000000..1cd959523f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "convolution_parameters.hpp" + +#include +#include +#include +#include + +namespace arm_gemm { + +// Class to assist with convolution calculations. +// +// This is framed as a hierarchy of objects: +// +// - Top level object which depends only on convolution parameters. This sets up std::vectors for the padding and +// kernel offset arrays. From this you can request: +// +// - Mid level object (e.g. instantiated at start of 'ConvolutionInterleave'). This holds specifics about the +// input tensor, and the desired column range. Calculations specific to this can be done once when this is set +// up. From this you can request: +// +// - Low level object (instantiated for each range of rows). This contains methods to actually populate a row +// pointer array. + + +template +class convolver { +private: + const ConvolutionParameters m_params; + + // Vector of padding data + const std::vector m_pad_row; + + // X/Y offsets for each kernel position + std::vector m_kernel_y; + std::vector m_kernel_x; + + class column_handler { + private: + const convolver &m_parent; + + // Base/stride of input image + const T * const m_input_base; + const size_t m_input_stride; + + // Starting kernel point and channel offset within that point + const unsigned int m_start_pos; + const unsigned int m_start_offset; + + // Total length to process, rounded length of each input channel block. 
+ const unsigned int m_length; + const unsigned int m_rounded_stringlen; + + class row_handler { + private: + const convolver &m_convolver; + const column_handler &m_parent; + + // These variables track progress through the current block of rows + unsigned int m_start_output_y=0; + unsigned int m_start_output_x=0; + + unsigned int m_length_remaining=0; + unsigned int m_current_pos=0; + + unsigned int m_active_height=0; + + public: + row_handler(const column_handler &parent, unsigned int start_row, unsigned int active_height) : + m_convolver(parent.m_parent), + m_parent(parent), + m_start_output_y(start_row / m_convolver.m_params.output_width), + m_start_output_x(start_row % m_convolver.m_params.output_width), + m_length_remaining(m_parent.m_length), + m_current_pos(m_parent.m_start_pos), + m_active_height(active_height) { } + + bool finished() const { + return (m_length_remaining == 0); + } + + std::tuple next_block(const T ** const row_ptr) { + if (finished()) { + return { 0, 0 }; + } + + // "in_width" in the amount of data that will be read in (copied) + // "out_width" is the total amount of data that will be produced (including padding) + unsigned int offset = (m_current_pos == m_parent.m_start_pos) ? 
m_parent.m_start_offset : 0; + unsigned int in_width = std::min(m_length_remaining, static_cast(m_convolver.m_params.input_channels) - offset); + unsigned int out_width = std::min(m_length_remaining, m_parent.m_rounded_stringlen - offset); + + unsigned int output_y = m_start_output_y; + unsigned int output_x = m_start_output_x; + + for (unsigned int row=0; row= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) { + row_ptr[row] = m_convolver.m_pad_row.data(); + } else { + row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride; + } + + output_x++; + if (output_x == m_convolver.m_params.output_width) { + output_y++; + output_x=0; + } + } + + m_current_pos++; + m_length_remaining-=out_width; + + return { in_width, offset }; + } + }; // end of "row handler" class + + public: + column_handler(const convolver &parent, const T *input_base, size_t input_stride, + unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) + : m_parent(parent), m_input_base(input_base), m_input_stride(input_stride), + m_start_pos(k_start / rounded_stringlen), + m_start_offset(k_start % rounded_stringlen), + m_length(k_end - k_start), + m_rounded_stringlen(rounded_stringlen) { } + + row_handler process_rows(unsigned int start_row, unsigned int active_height) const { + return row_handler(*this, start_row, active_height); + } + }; // end of "column handler" class + +public: + convolver(ConvolutionParameters params) : + m_params (params), m_pad_row(params.input_channels, static_cast(params.padding_value)), + m_kernel_y(params.kernel_width * params.kernel_height, 0), + m_kernel_x(params.kernel_width * params.kernel_height, 0) { + + // Kernel points are addressed across, then down (assumed weight layout is WHIO) + for (unsigned int ky=0; ky gemm_bf16_methods[] = { #ifdef V8P6_BF -# ifdef __ARM_FEATURE_SVE -{ - GemmMethod::GEMM_HYBRID, - "hybrid_bf16fp32_mmla_6VLx2", - 
[](const GemmArgs &args) { return (args._Ksize>=8); }, - [](const GemmArgs &args) { return ((args._Msize <= 4) && (args._Nsize <= hybrid_bf16fp32_mmla_6VLx2::out_width())); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_HYBRID, - "hybrid_bf16fp32_mmla_8VLx2", - [](const GemmArgs &args) { return (args._Ksize>=8); }, - [](const GemmArgs &args) { return (args._Msize <= 4); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_HYBRID, - "hybrid_bf16fp32_mmla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize>=8); }, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_HYBRID, - "hybrid_bf16fp32_dot_4VLx4", - [](const GemmArgs &args) { return (args._Ksize>=8); }, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, +#ifdef __ARM_FEATURE_SVE { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, - "interleaved_bf16fp32_mmla_3VLx8", + "sve_interleaved_bf16fp32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_bf16fp32_dot_6x4VL", + nullptr, + [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, - "interleaved_bf16fp32_dot_3VLx8", + "sve_interleaved_bf16fp32_dot_8x3VL", [](const GemmArgs &args) { return (args._Ksize>2); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, # endif // SVE { // gemm_bf16_interleaved 
GemmMethod::GEMM_INTERLEAVED, - "interleaved_bf16fp32_mmla_12x8", + "a64_interleaved_bf16fp32_mmla_8x12", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_bf16fp32_dot_6x16", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, - "interleaved_bf16fp32_dot_12x8", + "a64_interleaved_bf16fp32_dot_8x12", [](const GemmArgs &args) { return (args._Ksize>2); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // V8P6_BF #ifdef __aarch64__ { GemmMethod::GEMM_INTERLEAVED, - "sgemm_12x8", + "a64_sgemm_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #elif defined(__arm__) { diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index 91012218e5..de2e4f2c2b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -29,15 +29,17 @@ #include "gemm_common.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_interleaved_pretransposed_2d.hpp" #include "kernels/a32_sgemm_8x6.hpp" -#include "kernels/a64_hgemm_24x8.hpp" -#include "kernels/a64_sgemm_12x8.hpp" -#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp" -#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp" +#include "kernels/a64_hgemm_8x24.hpp" +#include "kernels/a64_hybrid_fp16_mla_6x32.hpp" +#include "kernels/a64_sgemm_8x12.hpp" +#include "kernels/sve_hybrid_fp16_mla_6x4VL.hpp" +#include "kernels/sve_interleaved_fp16_mla_8x3VL.hpp" namespace 
arm_gemm { @@ -45,61 +47,51 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { #if defined(__ARM_FEATURE_SVE) { GemmMethod::GEMM_HYBRID, - "hybrid_fp16_mla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 8); }, + "sve_hybrid_fp16_mla_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp16_mla_3VLx8", + "sve_interleaved_fp16_mla_8x3VL", [](const GemmArgs &args) { return (args._Ksize > 4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)) { - GemmMethod::GEMM_INTERLEAVED_2D, - "hgemm_24x8_2d", + GemmMethod::GEMM_HYBRID, + "a64_hybrid_fp16_mla_6x32", #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC [](const GemmArgs &args) { return args._ci->has_fp16(); }, #else nullptr, #endif - [](const GemmArgs &args) { return args._maxthreads >= 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } + [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "hgemm_24x8_1d", + "a64_hgemm_8x24", #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC [](const GemmArgs &args) { return args._ci->has_fp16(); }, #else nullptr, #endif nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, - #endif // aarch64 && FP16 #ifdef __aarch64__ -//Pretranpose, 2D split -{ - 
GemmMethod::GEMM_INTERLEAVED_2D, - "sgemm_12x8_2d", - nullptr, - [](const GemmArgs &args) { return args._maxthreads >= 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } -}, -//Tranpose, 1D split, with blockmanager { GemmMethod::GEMM_INTERLEAVED, - "sgemm_12x8_1d", + "a64_sgemm_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #elif defined(__arm__) { diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index ddb438f06c..e9e335f500 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -24,6 +24,7 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_interleaved_pretransposed_2d.hpp" @@ -31,127 +32,130 @@ #include "gemv_pretransposed.hpp" #include "kernels/a32_sgemm_8x6.hpp" -#include "kernels/a64_hybrid_fp32_mla_16x4.hpp" -#include "kernels/a64_hybrid_fp32_mla_4x8.hpp" -#include "kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp" -#include "kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp" -#include "kernels/a64_sgemm_12x8.hpp" -#include "kernels/a64_sgemv_pretransposed.hpp" +#include "kernels/a64_gemv_fp32_mla_32.hpp" +#include "kernels/a64_hybrid_fp32_mla_6x16.hpp" +#include "kernels/a64_hybrid_fp32_mla_8x4.hpp" +#include "kernels/a64_sgemm_8x12.hpp" +#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp" +#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp" -#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp" -#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp" -#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp" -#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp" +#include "kernels/sve_gemv_fp32_mla_8VL.hpp" +#include 
"kernels/sve_hybrid_fp32_mla_6x4VL.hpp" +#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp" +#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp" +#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp" namespace arm_gemm { static const GemmImplementation gemm_fp32_methods[] = { +// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM. { GemmMethod::GEMV_BATCHED, "gemv_batched", - [](const GemmArgs &args) { return (args._Msize==1) && (args._nbatches>1); }, + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches>1 && !args._indirect_input; }, nullptr, [](const GemmArgs &args) { return new GemvBatched(args); } }, #ifdef __aarch64__ +#ifdef __ARM_FEATURE_SVE { - GemmMethod::GEMV_PRETRANSPOSED, - "sgemv_pretransposed", - [](const GemmArgs &args) { return (args._Msize==1 && args._nbatches==1); }, + GemmMethod::GEMM_HYBRID, + "sve_gemv_fp32_mla_8VL", + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemvPretransposed(args); } + [](const GemmArgs &args) { return new GemvPretransposed(args); } }, -#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32) +#endif { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mmla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + "a64_gemv_fp32_mla_32", + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args) { return new GemvPretransposed(args); } }, + +// MMLA next due to higher throughput (SVE only) +#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32) { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp32_mmla_3VLx8", + 
"sve_interleaved_fp32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE && MMLA_FP32 #ifdef __ARM_FEATURE_SVE -// SVE smallk / hybrid methods +// SVE smallk / hybrid methods { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_1VLx8", - [](const GemmArgs &args) { return (args._Ksize <= 24); }, + "sve_smallK_hybrid_fp32_mla_8x1VL", + [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, + "sve_hybrid_fp32_mla_8x1VL", + nullptr, + [](const GemmArgs &args) { return (args._Nsize < 12); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +}, +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_fp32_mla_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, #endif // __ARM_FEATURE_SVE // NEON hybrid methods { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_4x8", - [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0; }, + "a64_smallK_hybrid_fp32_mla_8x4", + [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_4x6", - [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 
4)==0; }, + "a64_smallK_hybrid_fp32_mla_6x4", + [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_4x8_normal", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, + "a64_hybrid_fp32_mla_8x4", + nullptr, [](const GemmArgs &args) { return (args._Nsize < 12); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_16x4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, - [](const GemmArgs &args) { return GemmHybrid::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + "a64_hybrid_fp32_mla_6x16", + nullptr, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } ), - #ifdef __ARM_FEATURE_SVE { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp32_mla_3VLx8", + "sve_interleaved_fp32_mla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE -// Pretranposed, 2D split -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED_2D, - "sgemm_12x8_2d", - nullptr, - [](const GemmArgs &args) { return GemmInterleavedPretransposed2d::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } -), -// 1D split (with pretransposed or 
not) GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, - "sgemm_12x8_1d", + "a64_sgemm_8x12", nullptr, - [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); }, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return new GemmInterleaved(args); } ), #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index 7a983ed6ac..d702cffce1 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -77,51 +77,43 @@ class GemmHybrid : public GemmCommon { return args._cfg->inner_block_size; } - const unsigned int L1_size = args._ci->get_L1_cache_size(); + // Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this. + unsigned int target_block_size = 2048 / sizeof(To); - // k_block: Find out how much of the larger array can be loaded into half the cache. - // This should account for associative caches. - unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + if (args._Ksize >= ((3 * target_block_size) / 2)) { + unsigned int target_blocks = iceildiv(args._Ksize, target_block_size); - // Needs to be (at least a single) multiple of the K unroll level. - k_block /= strategy::k_unroll(); - k_block = std::max(k_block, 1U) * strategy::k_unroll(); + unsigned int block_size = iceildiv(args._Ksize, target_blocks); - // Now tune to presented problem size; this is how many blocks we need. - unsigned int numk_blocks = iceildiv(args._Ksize, k_block); + block_size = roundup(block_size, strategy::k_unroll()); - // So divide the space equally into that many blocks. 
- k_block = iceildiv(args._Ksize, numk_blocks); - - // And round UP to the K unroll level required. - k_block = roundup(k_block, strategy::k_unroll()); + return block_size; + } - return k_block; + return args._Ksize; } + // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width. Otherwise do a + // single block. static unsigned int compute_n_block(const GemmArgs &args) { if (args._cfg && args._cfg->outer_block_size) { return args._cfg->outer_block_size; } - const unsigned int k_block = compute_k_block(args); - const unsigned int L2_size = args._ci->get_L2_cache_size(); - - // n_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * k_block); + if (args._Nsize <= 64) { + return args._Nsize; + } - // Needs to be (at least a single) multiple of the kernel output width. - n_block /= strategy::out_width(); - n_block = std::max(n_block, 1U) * strategy::out_width(); + if ((args._Msize / args._Nsize) > 155) { + return args._Nsize; + } - // And tune to the presented problem size. - unsigned int numblocks = iceildiv(args._Nsize, n_block); - n_block = iceildiv(args._Nsize, numblocks); - n_block = roundup(n_block, strategy::out_width()); + // Go slightly wider if thread count and depth are small. + if ((args._Ksize <= 128) && (args._maxthreads <= 16)) { + return strategy::out_width() * 3; + } - return n_block; + return strategy::out_width(); } public: diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp new file mode 100644 index 0000000000..eede1a4f76 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include +#include + +#include "arm_gemm.hpp" +#include "bias_adder.hpp" +#include "convolver.hpp" +#include "ndrange.hpp" +#include "performance_parameters.hpp" +#include "transform.hpp" +#include "utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#ifndef UNUSED +#define __I_DEFINED_UNUSED +#define UNUSED(x) ((void)(x)) +#endif + +namespace arm_gemm { + +namespace { + +// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do +// that. 
+ +template +class run_hybrid_kernel { +public: + template + static void run ( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const OutputStage &os, const int32_t *col_bias, unsigned int n_0 ); +}; + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const Nothing &, const int32_t *, unsigned int) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + UNUSED(kern_k); + + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate); +} + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + UNUSED(kern_k); + + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0); +} + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned 
int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { + UNUSED(kern_k); + // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop. + assert(M <= strategy::out_height()); + // We don't yet support indirect output (as the quantizer can't do it). + assert(output_arg.is_indirect == false); + + // We need a row sum buffer and intermediate output buffer. + // These go on the stack as they are not too large, using an automatic array and alloca() respectively. + int32_t row_sums[strategy::out_height()]; + typename strategy::result_type *result_buffer; + + unsigned int output_width = roundup(N, strategy::out_width()); + + result_buffer = reinterpret_cast(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type))); + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + // Perform the GEMM, into the output buffer. + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg(result_buffer, output_width), nullptr, Activation(), false); + } + + if (os.b_offset != 0) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k); +#endif + row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os); + } else { + memset(row_sums, 0, sizeof(int32_t) * strategy::out_height()); + } + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N); +#endif + // Quantize + requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0); + } +} + +} // anonymous namespace + +// Implementation of the GemmCommon abstract class. 
+template +class GemmHybridIndirect : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + GemmArgs _args; + OutputStage _os = {}; + + /* Quantized support (in addition to 'output stage' above) */ + int32_t *_col_bias = nullptr; + + const unsigned int _Ktotal; + const unsigned int _rounded_Ksize; + + /* Blocking info */ + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; + + /* Pretransposed buffer. */ + const Toi *_B_transposed=nullptr; + + /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */ + const To * const * const * _indirect_buf = nullptr; + + /* Convolver - only set up for convolution problems, so also doubles as a flag. */ + std::unique_ptr> _convolver = nullptr; + + // Array of pointers to output rows +// Tr * const * _output_ptrs; + + const NDRange<4> _window_range; + + unsigned int get_col_sum_size() const { + if (std::is_same::value) { + return _args._Nsize * _args._nmulti * sizeof(int32_t); + } else { + return 0; + } + } + + static unsigned int get_ktotal(const GemmArgs &args) { + return args._Ksections * roundup(args._Ksize, strategy::k_unroll()); + } + + static unsigned int compute_k_block(const GemmArgs &args) { + // Some kernels don't support accumulate mode - these can't do K blocking at all. + if (!strategy::supports_accumulate() || std::is_same::value) { + return get_ktotal(args); + } + + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } + + // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other + // datatypes); but don't divide into blocks until we hit 1.5X this size. 
+ unsigned int target_block_size = 2048 / sizeof(To); + auto ktotal = get_ktotal(args); + + if (ktotal > ((target_block_size*3)/2)) { + unsigned int target_blocks = iceildiv(ktotal, target_block_size); + + unsigned int block_size = iceildiv(ktotal, target_blocks); + + block_size = roundup(block_size, strategy::k_unroll()); + + return block_size; + } + + return ktotal; + } + + // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width. Otherwise do a + // single block. + static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; + } + + if (args._Nsize <= 64) { + return args._Nsize; + } + + if ((args._Msize / args._Nsize) > 155) { + return args._Nsize; + } + + // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise + // use imply a great deal of repeated work performing the row sums. If row sums are involved, work out how + // much "column" parallelism is going to be required and set the block size accordingly. + if (std::is_same::value) { + const Requantize32 *qp = reinterpret_cast(&os); + + // Row sums only needed if b_offset isn't 0 + if (qp->b_offset != 0) { + // We can already parallelize across batches, multis and rows (in units of 'out_height') + int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height()); + + // If this isn't enough, we will need to split up the columns too. + if (multi_row_parallelism < args._maxthreads) { + unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism); + + unsigned int n_block = iceildiv(args._Nsize, columns_needed); + + return roundup(n_block, strategy::out_width()); + } + + // Multi/Batch/Row parallelism is enough - don't split up the columns. 
+ return args._Nsize; + } + } + + if (args._Ksize <= 128 && args._maxthreads <= 16) { + return strategy::out_width() * 3; + } + + return strategy::out_width(); + } + +public: + GemmHybridIndirect(GemmHybridIndirect &) = delete; + GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete; + + /* Constructor */ + GemmHybridIndirect(const GemmArgs &args, const OutputStage &os) + : _args(args), _os(os), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches, + iceildiv(args._Nsize, _n_block), args._nmulti) + { + // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the + // GemmConfig. Clear out the pointer to avoid accidents. + _args._cfg = nullptr; + } + + /* Constructor without OutputStage */ + GemmHybridIndirect(const GemmArgs &args) + : _args(args), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches, + iceildiv(args._Nsize, _n_block), args._nmulti) + { + // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the + // GemmConfig. Clear out the pointer to avoid accidents. + _args._cfg = nullptr; + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + return { _window_range.total_size() }; + } + + // This kernel can always be dynamically scheduled. 
+ bool supports_dynamic_scheduling() const override { + return true; + } + + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_args._ci); + + std::vector in_row_ptrs; + std::vector in_row_strings; + std::vector string_lengths; + + // In convolution mode, we need input pointers. + if (_convolver) { + in_row_ptrs = std::vector(strategy::out_height() * _args._Ksections, nullptr); + in_row_strings = std::vector(_args._Ksections, nullptr); + + for (unsigned int i=0; i<_args._Ksections; i++) { + in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]); + } + } + + // In any indirect mode, we need the string lengths. + if (_args._indirect_input) { + string_lengths = std::vector(_args._Ksections, 0); + } + + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); +// static_assert(std::is_same::value, "gemm_native: Result types must be the same."); + + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. */ + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ktotal); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); + + const bool first_pass = (k0 == 0); + const bool last_pass = (kmax == _Ktotal); + + unsigned int first_section = (k0 / _rounded_Ksize); + unsigned int first_offset = (k0 % _rounded_Ksize); + unsigned int kleft = kern_k; + unsigned int sections=0; + unsigned int offset = first_offset; + + if (_args._indirect_input) { + while (kleft) { + // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be + // processed (excluding padding). But the amount we subtract from 'kleft' takes account of any + // padding applied. 
+ string_lengths[sections] = std::min(kleft, _args._Ksize - offset); + kleft -= std::min(kleft, _rounded_Ksize - offset); + sections++; + offset=0; + } + } + + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); + + if (p.done()) { + return; + } + + // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call. + // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results). + // The convolution path only generates the pointers for one block of rows at a time. + const bool process_all_rows = (!SeparateQuantize && !_convolver); + + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize); +// const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) + + (k0 * roundup(_args._Nsize, strategy::out_width())) + + (n0 * kern_k); + + IndirectOutputArg out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc); + +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); +#endif + if (_indirect_buf) { + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, sections, string_lengths.data(), + IndirectInputArg(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset), + (m_end
- m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? _args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } else if (_convolver) { + auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize); + + unsigned int pos=0; + auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start); + + while (!conv_rows.finished()) { + unsigned int width, conv_offset; + + assert(pos < sections); + + std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()])); + + if (pos==0) { + assert(conv_offset == first_offset); + } + assert(width == string_lengths[pos]); + pos++; + } + assert(pos == sections); + + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, sections, string_lengths.data(), + IndirectInputArg(in_row_strings.data(), 0, first_offset), + (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? _args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } else { + // Length to process. This needs to exclude padding, but 'kmax' potentially includes it. + const unsigned int len = (std::min(_args._Ksize, kmax) - k0); + + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, 1, &len, + IndirectInputArg(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda), + (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? 
_args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } + } while (process_all_rows ? p.next_dim1() : p.next_dim0()); + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return (_B_transposed==nullptr); + } + + size_t get_B_pretransposed_array_size() const override { + // Start with actual pretransposed buffer... + size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi); + + // Space for result row pointers (not strictly needed any more but retained for indirect output testing) + size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *); + + if (std::is_same::value) { + size += get_col_sum_size(); + } + + return size; + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + if (std::is_same::value) { + _col_bias = reinterpret_cast(in_buffer); + + Requantize32 *qp_ptr = reinterpret_cast(&_os); + + for (unsigned int i=0; i<_args._nmulti; i++) { + // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size. + compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0); + } + } + + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = buffer; + + strategy strat(_args._ci); + + for (unsigned int multi=0; multi<_args._nmulti; multi++) { + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + const unsigned int kmax=std::min(k0 + _k_block, _Ktotal); + + /* Figure out the size of each block. 
*/ + unsigned int k_size = kmax - k0; + + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. + + // This is needed for computations below. + const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll()); + + // The expected output format is also an entire <out_width> columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time. + for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){ + unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize); + + // Track where we are and how much work is left. + unsigned int kpos = k0; + unsigned int kleft = k_size; + + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_args._Ksize - k_offset, kleft); + + strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. + + // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll()); + + buffer += strategy::out_width() * padded_length; + + kpos += padded_length; + kleft -= padded_length; + } + } + } + } + } + + void set_pretransposed_B_data(void *in_buffer) override { + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + _col_bias = reinterpret_cast(in_buffer); + } + + // Estimate cycles for given problem given provided parameters + static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) { + // Note: Current hybrid kernels don't actually round up height (they + // have paths for each possible height). Might need to make this + // configurable in future. + uint64_t total_macs = static_cast(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll()); + + float mac_cycles = static_cast(total_macs) / params.kernel_macs_cycle; + + // TODO: A bit of a kludge here: current hybrid kernels incur extra + // overhead where the width is not a multiple of kernel width. It's + // most noticeable where the overall width is quite low, so add 15% + // penalty for such widths.
+ if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) { + mac_cycles *= 1.15f; + } + + uint64_t total_cycles = mac_cycles; + + return total_cycles; + } + + void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { + if (std::is_same::value) { + Requantize32 *qp = reinterpret_cast(&_os); + + qp->bias = bias; + qp->bias_multi_stride = bias_multi_stride; + } + } + + void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override { + assert(string_len == _args._Ksize); + _indirect_buf = ptr; + } + + void set_convolution_parameters(ConvolutionParameters parms) override { + assert(parms.input_channels == _args._Ksize); + _convolver = std::unique_ptr>(new convolver(parms)); + } +}; + +} // namespace arm_gemm + +#ifdef __I_DEFINED_UNUSED +#undef UNUSED +#endif diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index 915227fc29..7a5fa87ee6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -118,18 +118,27 @@ class GemmHybridQuantized : public GemmCommon { // n_block: Work out how many rows (of length k_block) will fit in the L2 // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * k_block); + const unsigned int scaled_l2_size = (L2_size * 9) / 10; + const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()); + + // .. if the L1 contents is bigger than the L2, just return a minimal size block. 
+ if (k_block_area > scaled_l2_size) { + return strategy::out_width(); + } + + unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block); // Needs to be (at least a single) multiple of the kernel output width. n_block /= strategy::out_width(); - n_block = std::max(n_block, 1U) * strategy::out_width(); + n_block = std::max(n_block, 1u) * strategy::out_width(); // And tune to the presented problem size. unsigned int numblocks = iceildiv(args._Nsize, n_block); n_block = iceildiv(args._Nsize, numblocks); n_block = roundup(n_block, strategy::out_width()); + assert(n_block > 0); + return n_block; } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp new file mode 100644 index 0000000000..7376b5ffe3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2017-2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include + +#include "arm_gemm.hpp" +#include "ndrange.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_gemm { + +// Implementation of the GemmCommon abstract class. +template +class GemmHybridQuantizedInline : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + /* Blocking info */ + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; + + /* Pretransposed buffer. */ + const Toi *_B_transposed=nullptr; + + const NDRange<4> _window_range; + + Requantize32 _qp; + int32_t *col_bias = nullptr; + + void *working_space = nullptr; + + unsigned int _nthreads; + + unsigned int get_col_sum_size() const { + return _Nsize * _nmulti * sizeof(int32_t); + } + + static unsigned int compute_k_block(const GemmArgs &args) { + // We don't support K blocks as we only temporarily store 32 bit results. + return args._Ksize; + + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } + + const unsigned int L1_size = args._ci->get_L1_cache_size(); + + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. 
+ unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + k_block /= strategy::k_unroll(); + k_block = std::max(k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int numk_blocks = iceildiv(args._Ksize, k_block); + + // So divide the space equally into that many blocks. + k_block = iceildiv(args._Ksize, numk_blocks); + + // And round UP to the K unroll level required. + k_block = roundup(k_block, strategy::k_unroll()); + + return k_block; + } + + static unsigned int compute_n_block(const GemmArgs &args) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; + } + + const unsigned int k_block = compute_k_block(args); + const unsigned int L2_size = args._ci->get_L2_cache_size(); + + // n_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + n_block /= strategy::out_width(); + n_block = std::max(n_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. 
+ unsigned int numblocks = iceildiv(args._Nsize, n_block); + n_block = iceildiv(args._Nsize, numblocks); + n_block = roundup(n_block, strategy::out_width()); + + return n_block; + } + +public: + GemmHybridQuantizedInline(GemmHybridQuantizedInline &) = delete; + GemmHybridQuantizedInline & operator= (GemmHybridQuantizedInline &) = delete; + + /* Constructor */ + GemmHybridQuantizedInline(const GemmArgs &args, const Requantize32 &qp) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmulti(args._nmulti), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti), + _qp (qp), _nthreads(args._maxthreads) { } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + return { _window_range.total_size() }; + } + + // This kernel can always be dynamically scheduled. + bool supports_dynamic_scheduling() const override { + return true; + } + + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_ci); + + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); + + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. 
*/ + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ksize); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); + + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); + + if (p.done()) { + return; + } + + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) + + (k0 * roundup(_Nsize, strategy::out_width())) + + (n0 * kern_k); + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); +#endif + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + b_panel, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + (m_end - m_start), (nmax - n0), kmax - k0, + col_bias + (multi * _Nsize) + n0, _qp); + } + } while (p.next_dim1()); + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return (_B_transposed==nullptr); + } + + size_t get_B_pretransposed_array_size() const override { + return get_col_sum_size() + (roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi)); + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + col_bias = reinterpret_cast(in_buffer); + + for (unsigned int i=0; 
i<_nmulti; i++) { + compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0); + } + + uintptr_t buffer_int = reinterpret_cast(in_buffer); + Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = buffer; + strategy strat(_ci); + + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + const unsigned int kmax = std::min(k0 + _k_block, _Ksize); + const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll()); + + for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) { + const unsigned int xmax = std::min(x0+_n_block, _Nsize); + + const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size; + + strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, k0, kmax); + + buffer += size; + } + } + } + } + + void set_pretransposed_B_data(void *in_buffer) override { + uintptr_t buffer_int = reinterpret_cast(in_buffer); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + col_bias = reinterpret_cast(in_buffer); + } + + void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { + _qp.bias = bias; + _qp.bias_multi_stride = bias_multi_stride; + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index 261e7d2d9c..f6a0fc5d52 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -37,9 +37,9 @@ template struct GemmImplementation { const GemmMethod method; const char * name; - std::function is_supported; - std::function cycle_estimate; - std::function *(const GemmArgs &, const OutputStage &)> instantiate; + std::function is_supported = {}; + std::function cycle_estimate = {}; + std::function *(const GemmArgs &, const OutputStage &)> instantiate = {}; bool do_is_supported(const GemmArgs &args, 
const OutputStage &os) const { if (is_supported != nullptr) { @@ -57,13 +57,13 @@ struct GemmImplementation { } } - GemmImplementation(const GemmImplementation &) = default; - GemmImplementation &operator= (const GemmImplementation &) = default; - GemmCommon *do_instantiate(const GemmArgs &args, const OutputStage &os) const { return instantiate(args, os); } + GemmImplementation(const GemmImplementation &) = default; + GemmImplementation & operator= (const GemmImplementation &) = default; + GemmImplementation(GemmMethod m, const char *n, std::function is_supported, std::function is_recommended, std::function *(const GemmArgs &, const OutputStage &)> instantiate) : @@ -79,9 +79,9 @@ template struct GemmImplementation { const GemmMethod method; const char * name; - std::function is_supported; - std::function cycle_estimate; - std::function *(const GemmArgs &)> instantiate; + std::function is_supported = {}; + std::function cycle_estimate = {}; + std::function *(const GemmArgs &)> instantiate = {}; bool do_is_supported(const GemmArgs &args, const Nothing &) const { if (is_supported != nullptr) { @@ -103,7 +103,6 @@ struct GemmImplementation { return instantiate(args); } - static GemmImplementation with_estimate(GemmMethod m, const char *n, std::function is_supported, std::function cycle_estimate, std::function *(const GemmArgs &)> instantiate) { @@ -116,7 +115,10 @@ struct GemmImplementation { return impl; } - GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {} + GemmImplementation(const GemmImplementation &) = default; + GemmImplementation & operator= (const GemmImplementation &) = default; + + GemmImplementation(GemmMethod m, const char * n) : method(m), name(n) {} GemmImplementation(GemmMethod m, const char *n, std::function is_supported, std::function is_recommended, @@ -124,9 +126,6 @@ struct GemmImplementation { method(m), name(n), is_supported(is_supported), 
cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ), instantiate(instantiate) { } - - GemmImplementation(const GemmImplementation &) = default; - GemmImplementation &operator=(const GemmImplementation &) = default; }; /* "Master" function implemented for each valid combination of types. @@ -211,6 +210,7 @@ std::vector get_compatible_kernels(const GemmArgs &args, cons for (const GemmImplementation *i = gemms; i->method != GemmMethod::DEFAULT; i++) { /* Check that this implementation supports the presented problem. */ + if (!i->do_is_supported(args, os)) { continue; } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp index da682330a0..a3a61959c3 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -28,17 +28,17 @@ #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "kernels/a64_gemm_s16_12x8.hpp" +#include "kernels/a64_gemm_s16_8x12.hpp" namespace arm_gemm { static const GemmImplementation gemm_s16_methods[] = { { GemmMethod::GEMM_INTERLEAVED, - "gemm_s16_12x8", + "a64_gemm_s16_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index 147caeefbd..31f225002e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -26,21 +26,22 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "gemm_interleaved_pretransposed_2d.hpp" -#include "kernels/a64_gemm_s16_12x8.hpp" -#include "kernels/a64_gemm_s8_12x8.hpp" 
+#include "kernels/a64_gemm_s16_8x12.hpp" +#include "kernels/a64_gemm_s8_8x12.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" -#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" -#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp" -#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp" -#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" -#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp" +#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" + +#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" namespace arm_gemm { @@ -49,106 +50,84 @@ static const GemmImplementation gemm_s8_methods[] = { #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_mmla_3VLx8", + "sve_interleaved_s8s32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_1VLx8", - [](const GemmArgs &args) { return args._Ksize<=64; }, + "sve_smallK_hybrid_s8s32_dot_8x1VL", + [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_s8s32_dot_4VLx4", + "sve_hybrid_s8s32_dot_6x4VL", [](const GemmArgs &args) { return args._Ksize>=16; }, [](const GemmArgs &args) { return ((args._Ksize <= 128) && 
(args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_dot_3VLx8", + "sve_interleaved_s8s32_dot_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, -#endif +#endif // SVE #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_mmla_12x8", + "a64_interleaved_s8s32_mmla_8x12", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_4x8", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + "a64_smallK_hybrid_s8s32_dot_8x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_4x6", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_s8s32_dot_6x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_HYBRID, - "hybrid_s8s32_dot_16x4", - [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args) { return args._Nsize<=256 && 
args._Ksize>128; }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_s8_12x8_2d", - [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_s8_12x8_1d", - [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + "a64_gemm_s16_8x12", nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Ksize>4; }, + [](const GemmArgs &args) { return new GemmInterleaved(args); }, }, { - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_s16_12x8_2d", - nullptr, - [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); }, + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8s32_dot_6x16", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_s16_12x8_1d", - nullptr, - [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; }, - [](const GemmArgs &args) { return new GemmInterleaved(args); }, -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_s8_4x4_2d", + "a64_gemm_s8_8x12", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, nullptr, - [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) || - ((args._Msize / args._maxthreads) < 4); }, - [](const GemmArgs &args) { return new 
GemmInterleavedPretransposed2d(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_s8_4x4_1d", + "a64_gemm_s8_4x4", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index c4dceef922..92c1086a5f 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -27,11 +27,12 @@ #include #include "arm_gemm.hpp" -#include "utils.hpp" - +#include "convolver.hpp" #include "mergeresults.hpp" #include "performance_parameters.hpp" +#include "quantized.hpp" #include "transform.hpp" +#include "utils.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -46,12 +47,212 @@ // // This implementation interleaves the source matrices in blocks - good for // larger matrices. + namespace arm_gemm { -template +namespace { + +// Some kernels output to a linear buffer and require a separate merge step. +// Others output directly to the matrix result. This helper class calls the +// appropriate functions, using templating to avoid calling non-existent +// functions. 
+template +class kernel_and_merge { +public: + template + static void run ( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias, + Tab *acc_buff); +}; + +// Run a kernel and call the separate merge step +template<> +template +void kernel_and_merge::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *) +{ + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + } + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate); + } +} + +// Run a kernel with integrated merge +template<> +template +void kernel_and_merge::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, + unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const Nothing &, const int32_t *, + Tab *acc_buff) +{ +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k); +#endif + + // We need to offset 
the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need + // to be careful not to offset a null pointer. + Tri *offset_c_ptr; + + if (c_ptr == nullptr) { + offset_c_ptr = nullptr; + } else { + offset_c_ptr = c_ptr + m_0 * ldc + n_0; + } + + strat.kernel(// A and B pointers are just the packed panels. + a_ptr, b_panel, + // Provide relevant part of output array and row stride. + offset_c_ptr, ldc, + // M, N, K sizes + m_max-m_0, n_max - n_0, kern_k, + // Bias, activation, accumulation. Need to offset the bias as needed. + biasptr ? biasptr + n_0 : nullptr, act, accumulate, + // Accumulation buffer. + acc_buff ); +} + +// Run a kernel with integrated merge, quantizing +template<> +template +void kernel_and_merge::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, + unsigned int n_0, unsigned int n_max, const Tr *, + const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias, + Tab *acc_buff) +{ +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k); +#endif + + strat.kernel(// A and B pointers are just the packed panels. + a_ptr, b_panel, + // Provide relevant part of output array and row stride. + c_ptr + m_0 * ldc + n_0, ldc, + // M, N, K sizes + m_max-m_0, n_max - n_0, kern_k, + // Bias, activation, accumulation. Need to offset the bias as needed. 
+ col_bias + n_0, qp, n_0, accumulate, acc_buff); +} + +// Run a kernel and call the separate quantize step +template<> +template +void kernel_and_merge::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *, + const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias, + Tab *) +{ + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + } + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + // The interleaved kernel outputs in blocks - each block is a + // row-major matrix of size out_width * out_height. The merge + // kernels are designed to deal with this but the requantizer is + // not, so we need to requantize one block at a time. + for (int i=0; i(a_ptr + strategy::out_height() * kern_k); + + requantize_block_32(qp, (n_end - n_start), (m_max-m_0), + c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(), + c_ptr + m_0 * ldc + n_start, ldc, + row_bias, col_bias + n_start, n_start); + } + } +} + +// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in +// "requantizing" context where the output will be requantized. +// +// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and +// if we are not we don't. +// +// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support +// quantization to define useless "quantized" transforms. 
+template +class transform_type { +public: + typedef decltype(strategy::transforms) type; +}; + +template +class transform_type { +public: + typedef decltype(strategy::transforms_quantized) type; +}; + +// We need a similar trick here to figure out what type the accumulator buffer should be. +template +class accumulate_buffer_type { +public: + typedef typename strategy::result_type type; +}; + +template +class accumulate_buffer_type { +public: + typedef int32_t type; +}; + +} // anonymous namespace + +template class GemmInterleaved : public GemmCommon { typedef typename strategy::operand_type Toi; typedef typename strategy::result_type Tri; + typedef typename accumulate_buffer_type::type Tab; /* const properties set by constructor */ const CPUInfo * const _ci; @@ -59,10 +260,15 @@ class GemmInterleaved : public GemmCommon { const unsigned int _Msize; const unsigned int _Nsize; const unsigned int _Ksize; + const unsigned int _Ksections; + const unsigned int _Ktotal; + const unsigned int _rounded_Ksize; const unsigned int _nbatches; const unsigned int _nmulti; + const bool _thread_columns; + const Activation _act; const int _maxthreads; @@ -77,30 +283,59 @@ class GemmInterleaved : public GemmCommon { const Toi *_B_transposed=nullptr; void *_working_space=nullptr; + Tab *_accumulation_buffer=nullptr; + + /* Output stage */ + OutputStage _os; + + /* Quantized support (in addition to 'output stage' above */ + int32_t *col_bias = nullptr; + + /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */ + const To * const * const * _indirect_buf = nullptr; + + /* Convolver - only set up for convolution problems, so also doubles as a flag. */ + std::unique_ptr> _convolver = nullptr; + + unsigned int get_col_sum_size() const { + if (std::is_same::value) { + return _Nsize * _nmulti * sizeof(int32_t); + } else { + return 0; + } + } + /* We will need to walk through the blocks of B in a few contexts, so * factor that out. 
*/ class blockwalker { private: /* Size loops, etc. based on our parent's configuration */ - const GemmInterleaved &_parent; + const GemmInterleaved &_parent; /* K, X and multi parameters for current iteration. */ unsigned int _k0=0, _x0=0, _multi=0; + /* Range of X to iterate over - used in "ForceThreadColumns" cases */ + unsigned int _x_start=0; + unsigned int _x_end=_parent._Nsize; + unsigned int _index=0; bool _done=false; bool _newkblock=true; bool _newmulti=true; public: - blockwalker(const GemmInterleaved &parent) : _parent(parent) { } + blockwalker(const GemmInterleaved &parent) : _parent(parent) { } + + blockwalker(const GemmInterleaved &parent, + unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (x_start), _x_start(x_start), _x_end(x_end) { } /* _x0 is declared before _x_start, so it is seeded from the x_start argument rather than from the not-yet-initialised member. */ unsigned int xmax() { - return std::min(_x0 + _parent._x_block, _parent._Nsize); + return std::min(_x0 + _parent._x_block, _x_end); } unsigned int kmax() { - return std::min(_k0 + _parent._k_block, _parent._Ksize); + return std::min(_k0 + _parent._k_block, _parent._Ktotal); } /* Advance to the next block, return false at the end. */ @@ -111,10 +346,10 @@ class GemmInterleaved : public GemmCommon { _newkblock=false; _x0 += _parent._x_block; - if (_x0 >= _parent._Nsize) { - _x0=0; + if (_x0 >= _x_end) { + _x0=_x_start; _k0 += _parent._k_block; - if (_k0 >= _parent._Ksize) { + if (_k0 >= _parent._Ktotal) { _k0=0; _multi++; if (_multi >= _parent._nmulti) { @@ -138,14 +373,125 @@ class GemmInterleaved : public GemmCommon { bool newkblock(void) { return _newkblock; } }; - // A working size: One of these needed, regardless of thread count. Divided according to window. + // "k block" has two distinct uses: figuring out which iterations of K + // to actually process, but also various size/pointer computations. The + // latter needs to take account of the extra space needed for the row + // sums, if appropriate. 
+ unsigned int get_total_k_depth() const { + unsigned int k_depth = _k_block; + + if (std::is_same::value) { + k_depth += sizeof(int32_t) / sizeof(Toi); + } + + return k_depth; + } + + // A working size. size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches); + if (_thread_columns) { + // For 2D threading: allocate a buffer of one block of rows per thread + return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads); + } else { + // For 1D threaded: one of these needed, regardless of thread count. Divided according to window. + return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches); + } } - // C working size: One needed per thread. + // C working size: One needed per thread. Not needed if there is no merge step. size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + if (MergeStep) { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } else { + return 0; + } + } + + // Accumulation buffer size + size_t get_accumulation_buffer_size() const { + // We only support an accumulation buffer for non-merge cases. + if (MergeStep) { + return 0; + } + + // Check if we are actually blocking + if (_k_block == _Ktotal) { + return 0; + } + + // We are no-merge, non-quantized with active blocking: accumulation buffer needed. + size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width(); + size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti; + + return num_buffers * size_per_buffer; + } + + // Get pointer into accumulation buffer + Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const { + // Don't do anything if there's no buffer. 
+ if (_accumulation_buffer == nullptr) { + return nullptr; + } + + // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes. + size_t size_per_buffer = strategy::out_height() * strategy::out_width(); + + size_t buffer_rows = iceildiv(_Msize, strategy::out_height()); + size_t buffer_cols = iceildiv(_Nsize, strategy::out_width()); + size_t buffers_per_batch = (buffer_rows * buffer_cols); + size_t buffers_per_multi = buffers_per_batch * _nbatches; + + // M/N must reference the top-left corner of a block. + size_t row = M / strategy::out_height(); + assert(M % strategy::out_height() == 0); + size_t col = N / strategy::out_width(); + assert(N % strategy::out_width() == 0); + + size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col; + + return _accumulation_buffer + (buffer_index * size_per_buffer); + } + + int32_t row_sum_multiplier() const { + if (std::is_same::value) { + const Requantize32 *qp = reinterpret_cast(&_os); + + return -qp->b_offset; + } + + return 0; + } + + // Heuristics to decide whether to use the 'thread columns' regime + static bool is_thread_columns(const GemmArgs &args) { + // For now, there is a template parameter to force it. + if (ForceThreadColumns) { + return true; + } + + // Never do this for single threaded cases. + if (args._maxthreads == 1) { + return false; + } + + // How many blocks of work are available for threading on M? + int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches; + + // If we just can't share the work across threads with the row threading regime. 
+ if (args._maxthreads > m_blocks) { + return true; + } + + // If the row threading regime is too wasteful (20% threshold) + if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) { + return true; + } + + return false; + } + + static unsigned int get_ktotal(const GemmArgs &args) { + return args._Ksections * roundup(args._Ksize, strategy::k_unroll()); } static unsigned int get_k_block_size(const GemmArgs &args) { @@ -153,6 +499,11 @@ class GemmInterleaved : public GemmCommon { return args._cfg->inner_block_size; } + // K blocking not supported if we are requantizing. + if (std::is_same::value) { + return get_ktotal(args); + } + const unsigned int L1_size = args._ci->get_L1_cache_size(); unsigned int k_block; @@ -165,58 +516,84 @@ class GemmInterleaved : public GemmCommon { k_block = std::max(k_block, 1U) * strategy::k_unroll(); // Now tune to presented problem size; this is how many blocks we need. - unsigned int num_k_blocks = iceildiv(args._Ksize, k_block); + unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block); // So divide the space equally into that many blocks. - k_block = iceildiv(args._Ksize, num_k_blocks); + k_block = iceildiv(get_ktotal(args), num_k_blocks); // And round UP to the K unroll level required. 
k_block = roundup(k_block, strategy::k_unroll()); + assert(k_block > 0); + return k_block; } -public: - GemmInterleaved(GemmInterleaved &) = delete; - GemmInterleaved & operator= (GemmInterleaved &) = delete; - - /* Constructor */ - GemmInterleaved(const GemmArgs &args) - : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), - _nbatches(args._nbatches), _nmulti(args._nmulti), - _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), - _k_block(get_k_block_size(args)) { - const unsigned int L2_size = _ci->get_L2_cache_size(); - - assert(_maxthreads > 0); + static unsigned int get_x_block_size(const GemmArgs &args) { + if (is_thread_columns(args)) { + // In 2D mode, override X block, because we will process width first. + return roundup(args._Nsize, strategy::out_width()); + } - // Work out blocking parameters, or override from provided GemmConfig - // TODO: Move outer block into a static function too. if (args._cfg && args._cfg->outer_block_size) { - _x_block = args._cfg->outer_block_size; - } else { - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); + return roundup(args._cfg->outer_block_size, strategy::out_width()); + } - // Needs to be (at least a single) multiple of the kernel output width. - _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); + unsigned int x_block; + const unsigned int L2_size = args._ci->get_L2_cache_size(); + const unsigned int k_block = get_k_block_size(args); - // And tune to the presented problem size. 
- unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + const unsigned int scaled_l2_size = (L2_size * 9) / 10; + const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()); - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); + // .. if the L1 contents is bigger than the L2, just return a minimal size block. + if (k_block_area > scaled_l2_size) { + return strategy::out_width(); } - // Work out the rounded size of M - needed for some buffers. - _Mround = iceildiv(_Msize, strategy::out_height()); - _Mround *= strategy::out_height(); + x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + x_block /= strategy::out_width(); + x_block = std::max(x_block, 1u) * strategy::out_width(); + + // And tune to the presented problem size. 
+ unsigned int num_x_blocks = iceildiv(args._Nsize, x_block); + x_block = iceildiv(args._Nsize, num_x_blocks); + + x_block = roundup(x_block, strategy::out_width()); + + assert(x_block > 0); + + return x_block; } +public: + GemmInterleaved(GemmInterleaved &) = delete; + GemmInterleaved & operator= (GemmInterleaved &) = delete; + + /* Constructor */ + GemmInterleaved(const GemmArgs &args, const OutputStage &os) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _Ksections(args._Ksections), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())), + _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)), + _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), + _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())), + _os(os) { } + + /* Constructor without OutputStage */ + GemmInterleaved(const GemmArgs &args) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _Ksections(args._Ksections), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())), + _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)), + _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), + _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())), + _os() { } + // Interface implementation - Compulsory functions // Window size: Only the last thread should do a ragged block, so dole @@ -224,8 +601,14 @@ public: // not multi for now (as this would cause problems with the buffer // manager). ndrange_t get_window_size() const override { - // _Mround is a multiple of out_height by definition. 
- return { (_Mround / strategy::out_height()) * _nbatches }; + unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches; + + if (_thread_columns) { + return { row_blocks, iceildiv(_Nsize, strategy::out_width()) }; + } else { + // _Mround is a multiple of out_height by definition. + return { row_blocks }; + } } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. @@ -235,117 +618,262 @@ public: // Execute void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { - const auto start = work_range.get_position(0); - const auto end = work_range.get_position_end(0); #ifdef CYCLE_PROFILING profiler prof; #endif + + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + assert(_working_space); + int8_t *working_space_bytes = reinterpret_cast(_working_space); + + /* Align if needed */ + intptr_t working_space_v = reinterpret_cast(_working_space); + if (working_space_v & 0x3f) { + intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f); + working_space_bytes += alignment_offset; + } + strategy strat(_ci); - blockwalker current(*this); + const auto start = work_range.get_position(0); + const auto end = work_range.get_position_end(0); /* Translate 'start' and 'end' into a position within the batches and rows. */ const unsigned int window_per_batch = _Mround / strategy::out_height(); unsigned int batch_0 = start / window_per_batch; unsigned int batch_end = end / window_per_batch; - /* Compute the M values to operate on */ - unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height(); - unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height(); + // In ThreadColumns mode, process work one horizontal strip at a time. + // Transpose the block of needed rows at the start, then do all the work on that block. 
+ if (_thread_columns) { + const auto start_x = work_range.get_position(1) * strategy::out_width(); + const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize); - /* Make sure we've been set up correctly. */ - assert(_B_transposed); - assert(_working_space); - int8_t *working_space_bytes = reinterpret_cast(_working_space); + Tri * const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); + Toi * const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size()) + + (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height())); - // Private buffers. Treat working_space as an array of C buffers - // (one per thread) first, followed by the (window-divided) A - // buffer. - // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later. - Toi * const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size())); - Tri * const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + unsigned int kmax=std::min(k0+_k_block, _Ktotal); - const Toi *b_panel; - b_panel = _B_transposed; + unsigned int rounded_width = roundup(_Nsize, strategy::out_width()); - //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block); + const bool first_pass = (k0==0); + const bool last_pass = (kmax==_Ktotal); - // newkblock() is always true on the first iteration, so this will be set properly on the first loop. - int kern_k = 0; + // Figure out how many "K" the kernel will actually process. 
+ unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll()); - for (;!current.done();current.advance()) { - if (current.newkblock()) { -#ifdef CYCLE_PROFILING - auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi)); -#endif - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; - unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k); - if (first_m >= last_m) - continue; + unsigned int batch = batch_0; + unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height(); - strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, first_m, last_m, current.k0(), current.kmax()); - } + for (unsigned int p=start; p class: this extracts either 'transforms' or + // 'transforms_quantized' as appropriate. + typename transform_type::value>::type transforms; + + if (_indirect_buf != nullptr) { + transforms.PrepareA_indirect(a_panel, + _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize, + _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); + } else if (_convolver) { + transforms.PrepareA_convolution(a_panel, + this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), + this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); + } else { + transforms.PrepareA(a_panel, + this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), + this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier()); + } + } + + // Perform the kernel and merge step, either separately or together as required. 
+ kernel_and_merge::run( + #ifdef CYCLE_PROFILING + prof, + #endif + // Strategy and panel pointers + strat, a_panel, b_ptr, c_panel, + // Result buffer pointers + this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc, + // K size, and M/N ranges + kern_k, start_row, end_row, start_x, end_x, + // Only do bias on the first pass + ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr), + // Only do activation on the last pass, and accumulation on any non-first pass. + (last_pass ? _act : Activation()), !first_pass, + // Pass in quantization parameters for requantizing kernels (others will ignore) + _os, col_bias + (multi * _Nsize), + // Accumulation buffer (not yet implemented on this path) + static_cast(nullptr)); + + /* Increment to the next block */ + start_row += strategy::out_height(); + if (start_row >= _Msize) { + start_row = 0; + batch++; + } + } + } } + } else { + blockwalker current(*this); + + /* Compute the M values to operate on */ + unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height(); + + // Private buffers. Treat working_space as an array of C buffers + // (one per thread) first, followed by the (window-divided) A + // buffer. + // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later. + Toi * const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size())); + Tri * const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); - int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + const Toi *b_panel; + b_panel = _B_transposed; - /* Do the actual work. */ - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; - unsigned int last_m = (batch == batch_end) ? 
m_max : _Msize; + // newkblock() is always true on the first iteration, so these will be set properly on the first loop. - const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + // kern_k tracks the accumulation depth for the CURRENT K block a_panel_stride similarly tracks the total + // stride of the A panel (i.e. with 4 added for cases with embedded row sums) - if (first_m >= last_m) - continue; + // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and + // used for addressing inside a_panel. - for (unsigned int y=first_m; y class: this extracts either 'transforms' or + // 'transforms_quantized' as appropriate. + typename transform_type::value>::type transforms; + + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + if (_indirect_buf != nullptr) { + transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize, + _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); + } else if (_convolver) { + transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); + } else { + transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier()); + } + } + + // Figure out how many "K" the kernel will actually process. 
+ kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll()); - strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + // Requantizing GEMMs have the row sums built in to the + // transposed data, so the stride between rows is 4 bytes + // larger than the (rounded) K value. - a_ptr += (strategy::out_height() * kern_k); + if(std::is_same::value) { + a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi)); + } else { + a_panel_stride = kern_k; } + } - { -#ifdef CYCLE_PROFILING - auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); -#endif - /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */ - const bool first_pass = current.k0()==0; - const bool last_pass = current.kmax()==_Ksize; - - strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride), - c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(), - ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), - (last_pass ? _act : Activation()), !first_pass); + /* Do the actual work. */ + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth(); + + if (first_m >= last_m) + continue; + + // For the merge case we need to do this out_height() rows + // at a time, as that is the size of our intermediate + // buffer. If we are not doing that, we can do all the + // relevant rows in one go. + unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m); + + // But in the case where we have an accumulation buffer, we can't do that after all, unless + // there is no N blocking. 
+ if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) { + m_step = strategy::out_height(); + } + + for (unsigned int y=first_m; y_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride); + + // If we are using an accumulation buffer, we don't pass the result buffer to ask the kernel + // to write things into the accumulation buffer instead, except on the last pass. + if (_accumulation_buffer && !last_pass) { + result_ptr = nullptr; + } + + // Perform the kernel and merge step, either separately or together as required. + kernel_and_merge::run( + #ifdef CYCLE_PROFILING + prof, + #endif + // Strategy and panel pointers + strat, a_ptr, b_panel, c_panel, + // Result buffer pointers + result_ptr, this->_ldc, + // K size, and M/N ranges + kern_k, y, ymax, current.x0(), current.xmax(), + // Only do bias on the first pass + ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), + // Only do activation on the last pass, and accumulation on any non-first pass. + (last_pass ? _act : Activation()), !first_pass, + // Pass in quantization parameters for requantizing kernels (others will ignore) + _os, col_bias + (current.multi() * _Nsize), + // Accumulation buffer + get_accumulation_buffer(y, current.x0(), batch, current.multi()) ); + + a_ptr += (strategy::out_height() * a_panel_stride); } } - } - b_panel += (bblocks * strat.out_width() * kern_k); + b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k); + } } } // Interface implementation - working space size_t get_working_size() const override { - // In all cases, we need one A buffer plus a C buffer per thread. - size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads); + // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer. 
+ size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size(); - size += 64; // Add on a cache line extra for alignment. + size += 128; // Add on two cache lines extra for alignment. return size; } @@ -362,9 +890,22 @@ public: } working_space_bytes += diff; + working_space_int += diff; // Pretransposed case: just set internal pointer to parameter value. _working_space = reinterpret_cast(working_space_bytes); + + // Set up accumulation buffer + if (get_accumulation_buffer_size() > 0) { + intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads); + // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length) + if (acc_buff_int & 0x3F) { + acc_buff_int += (0x40 - (acc_buff_int & 0x3F)); + } + _accumulation_buffer = reinterpret_cast(acc_buff_int); + } else { + _accumulation_buffer = nullptr; + } } // Interface implementation - pretransposed @@ -376,56 +917,105 @@ public: return (_B_transposed==nullptr); } - // TODO: this could almost certainly be considerably simpler. size_t get_B_pretransposed_array_size() const override { - size_t total=0; - blockwalker current(*this); + unsigned int x_size = roundup(_Nsize, strategy::out_width()); - do { - /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); - unsigned int k_size = (current.kmax() - current.k0()); + return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size(); + } - /* Round sizes up as needed. 
*/ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + if (std::is_same::value) { + col_bias = reinterpret_cast(in_buffer); - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + Requantize32 *qp_ptr = reinterpret_cast(&_os); - total += x_size * k_size * sizeof(Toi); - } while (current.advance()); + for (unsigned int i=0; i<_nmulti; i++) { + // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size. + compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0); + } + } - return total; - } + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = buffer; - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { blockwalker current(*this); - Toi *buffer = reinterpret_cast(in_buffer); - _B_transposed = buffer; strategy strat(_ci); do { /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); unsigned int k_size = (current.kmax() - current.k0()); - /* Round sizes up as needed. */ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. + + // This is needed for computations below. 
+ const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll()); + + // The expected output format is also an entire columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time. + for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){ + unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax()); + + // Track where we are and how much work is left. + unsigned int kpos = current.k0(); + unsigned int kleft = k_size; + + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_Ksize - k_offset, kleft); + + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + // We need to modify our position based on the ROUNDED version of what we just did. 
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll()); - strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); + buffer += strategy::out_width() * padded_length; - buffer += (x_size * k_size); + kpos += padded_length; + kleft -= padded_length; + } + } } while (current.advance()); } void set_pretransposed_B_data(void *in_buffer) override { - _B_transposed = reinterpret_cast(in_buffer); + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + col_bias = reinterpret_cast(in_buffer); + } + + void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { + if (std::is_same::value) { + Requantize32 *qp = reinterpret_cast(&_os); + + qp->bias = bias; + qp->bias_multi_stride = bias_multi_stride; + } + } + + void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override { + assert(string_len == _Ksize); + _indirect_buf = ptr; + } + + void set_convolution_parameters(ConvolutionParameters parms) override { + assert(parms.input_channels == _Ksize); + _convolver = std::unique_ptr>(new convolver(parms)); } // Estimate cycles for given problem given provided parameters @@ -454,4 +1044,14 @@ public: } }; +// Aliases for the variations +template +using GemmInterleavedNoMerge = GemmInterleaved; + +template +using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved; + +template +using GemmInterleavedQuantized = GemmInterleaved; + } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp index bdccd05326..b71f390ab9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp @@ -250,7 +250,8 @@ class GemmInterleavedPretransposed2d : public GemmCommon { first_m, last_m, current.k0(), - current.kmax()); + current.kmax(), + 0); } } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index 04cac6095c..05c5116bf3 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -25,68 +25,151 @@ #include "arm_gemm.hpp" -#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp" -#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp" -#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp" +#include "kernels/a64_gemm_s16_8x12.hpp" +#include "kernels/a64_gemm_s8_4x4.hpp" +#include "kernels/a64_gemm_s8_8x12.hpp" +#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp" +#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp" +#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" +#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp" +#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp" +#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" + +#include "gemm_hybrid_indirect.hpp" #include "gemm_hybrid_quantized.hpp" +#include "gemm_hybrid_quantized_inline.hpp" +#include "gemm_interleaved.hpp" #include "quantize_wrapper.hpp" +#include "utils.hpp" namespace arm_gemm { static const GemmImplementation gemm_qint8_methods[] = { #ifdef __ARM_FEATURE_SVE +#ifdef MMLA_INT8 { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_1VLx8", - [](const GemmArgs &args, const 
Requantize32 &) { return args._Ksize<=64; }, + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_s8s32_mmla_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_s8s32_dot_4VLx4", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + "sve_smallK_hybrid_s8s32_dot_8x1VL", + [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } +}, +#ifdef SVE2 +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qs_dot_6x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qa_dot_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, #endif { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_4x8", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8s32_dot_6x4VL", + nullptr, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new 
GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_s8s32_dot_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, +#endif // SVE +#ifdef MMLA_INT8 +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_interleaved_s8s32_mmla_8x12", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } +}, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_4x6", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_s8s32_dot_8x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_s8s32_dot_16x4", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + "a64_smallK_hybrid_s8s32_dot_6x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new 
GemmHybridQuantized(args, qp); } }, -/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */ { - GemmMethod::QUANTIZE_WRAPPER_2D, - "quantized_wrapper_2d", + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s16_8x12", nullptr, - [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);}, - [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); } + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qs_dot_6x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qa_dot_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8s32_dot_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s8_8x12", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, { 
GemmMethod::QUANTIZE_WRAPPER, "quantized_wrapper", - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; }, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); } }, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp index 0125f9c5db..7342fda5d1 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp @@ -25,13 +25,25 @@ #include "arm_gemm.hpp" -#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp" -#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp" -#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp" +#include "kernels/a64_gemm_u16_8x12.hpp" +#include "kernels/a64_gemm_u8_4x4.hpp" +#include "kernels/a64_gemm_u8_8x12.hpp" +#include "kernels/a64_hybrid_u8qa_dot_4x16.hpp" +#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp" +#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp" +#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp" +#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp" + +#include "gemm_hybrid_indirect.hpp" #include "gemm_hybrid_quantized.hpp" +#include "gemm_hybrid_quantized_inline.hpp" +#include "gemm_interleaved.hpp" #include "quantize_wrapper.hpp" namespace arm_gemm { @@ -39,54 +51,108 @@ namespace arm_gemm { static const GemmImplementation gemm_quint8_methods[] = { #ifdef __ARM_FEATURE_SVE +#ifdef MMLA_INT8 { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_u8u32_dot_1VLx8", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; }, + 
GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_u8u32_mmla_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_u8u32_dot_4VLx4", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + "sve_smallK_hybrid_u8u32_dot_8x1VL", + [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } +}, +#ifdef SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL) +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_u8qa_dot_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, #endif { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_u8u32_dot_4x8", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + GemmMethod::GEMM_HYBRID, + "sve_hybrid_u8u32_dot_6x4VL", nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_u8u32_dot_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { 
return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, +#endif +#ifdef MMLA_INT8 +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_interleaved_u8u32_mmla_8x12", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } +}, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_u8u32_dot_4x6", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_u8u32_dot_8x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_u8u32_dot_16x4", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return ((args._Nsize<=256) && (args._Ksize>128)) || (args._maxthreads >= 8); }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } + "a64_smallK_hybrid_u8u32_dot_6x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, -/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */ { - GemmMethod::QUANTIZE_WRAPPER_2D, - "quantized_wrapper_2d", + GemmMethod::GEMM_INTERLEAVED, + 
"a64_gemm_u16_8x12", nullptr, - [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);}, - [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); } + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); }, +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8qa_dot_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8u32_dot_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_u8_8x12", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_u8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, { GemmMethod::QUANTIZE_WRAPPER, "quantized_wrapper", - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; }, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper(args, qp); } }, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp index 
5e06443e19..10a35e7a11 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -28,17 +28,17 @@ #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "kernels/a64_gemm_u16_12x8.hpp" +#include "kernels/a64_gemm_u16_8x12.hpp" namespace arm_gemm { static const GemmImplementation gemm_u16_methods[] = { { GemmMethod::GEMM_INTERLEAVED, - "gemm_u16_12x8", + "a64_gemm_u16_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index 06e68cbc43..c300b8cdf9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -29,18 +29,20 @@ #include "gemm_interleaved.hpp" #include "gemm_interleaved_pretransposed_2d.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" -#include "kernels/a64_gemm_u16_12x8.hpp" -#include "kernels/a64_gemm_u8_12x8.hpp" +#include "kernels/a64_gemm_u16_8x12.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" -#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" -#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp" -#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp" -#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" -#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp" +#include "kernels/a64_gemm_u8_8x12.hpp" +#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp" +#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp" + +#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp" +#include 
"kernels/sve_interleaved_u8u32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp" namespace arm_gemm { @@ -49,106 +51,84 @@ static const GemmImplementation gemm_u8_methods[] = { #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_mmla_3VLx8", + "sve_interleaved_u8u32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_1VLx8", - [](const GemmArgs &args) { return args._Ksize<=64; }, + "smallK_hybrid_u8u32_dot_8x1VL", + [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_u8u32_dot_4VLx4", - [](const GemmArgs &args) { return args._Ksize>=16; }, + "sve_hybrid_u8u32_dot_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_dot_3VLx8", + "sve_interleaved_u8u32_dot_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_mmla_12x8", + "a64_interleaved_u8u32_mmla_8x12", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } 
}, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_4x8", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + "a64_smallK_hybrid_u8u32_dot_8x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_4x6", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_u8u32_dot_6x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_HYBRID, - "hybrid_u8u32_dot_16x4", - [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, - [](const GemmArgs &args) { return new GemmHybrid(args); } -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_u8_12x8_2d", - [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8) ; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u8_12x8_1d", - [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + "a64_gemm_u16_8x12", nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; }, + [](const GemmArgs &args) { return new GemmInterleaved(args); }, }, { - 
GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_u16_12x8_2d", - nullptr, - [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); }, + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8u32_dot_6x16", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u16_12x8_1d", - nullptr, - [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; }, - [](const GemmArgs &args) { return new GemmInterleaved(args); }, -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_u8_4x4_2d", + "a64_gemm_u8_8x12", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, nullptr, - [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) || - ((args._Msize / args._maxthreads) < 4); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u8_4x4_1d", + "a64_gemm_u8_4x4", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 47909cdaeb..9de44fcb73 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -46,46 +46,39 @@ class GemvPretransposed : public GemmCommon { typedef typename strategy::operand_type Toi; typedef typename strategy::result_type Tri; - const unsigned int _Nsize; - const unsigned int _Ksize; - - const 
unsigned int _nmultis; - - const Activation _act; - - const CPUInfo * const _ci; + const GemmArgs _args; const unsigned int _buffer_per_multi; - unsigned int m_block=0; + unsigned int k_block=0; unsigned int n_block=0; - const Toi *_A_pretransposed = nullptr; + const Toi *_B_pretransposed = nullptr; public: GemvPretransposed(GemvPretransposed &) = delete; GemvPretransposed & operator= (GemvPretransposed &) = delete; GemvPretransposed(const GemmArgs &args) - : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci), - _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) { + : _args(args), + _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) { /* For now don't do any blocking. TODO: figure out if we should. */ - if (args._cfg && args._cfg->inner_block_size) { - m_block = args._cfg->inner_block_size; + if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) { + k_block = args._cfg->inner_block_size; } else { - m_block = _Ksize; + k_block = args._Ksize; } if (args._cfg && args._cfg->outer_block_size) { n_block = args._cfg->outer_block_size; } else { - n_block = _Nsize; + n_block = args._Nsize; } } // Window is number of out_width blocks, times number of multis. ndrange_t get_window_size() const override { - return { iceildiv(_Nsize, strategy::out_width()) * _nmultis }; + return { iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti }; } // Actually execute the GEMV. @@ -93,13 +86,13 @@ public: #ifdef CYCLE_PROFILING profiler prof; #endif - strategy strat(_ci); + strategy strat(_args._ci); const auto start = work_range.get_position(0); const auto end = work_range.get_position_end(0); /* Break the window values down into multis of interest... 
*/ - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); + const unsigned int window_per_multi = iceildiv(_args._Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; @@ -111,36 +104,25 @@ public: for (unsigned int multi=multi_0; multi<=multi_end; multi++) { const unsigned int n_start = (multi==multi_0) ? n_0 : 0; - const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize; + const unsigned int n_end = (multi==multi_end) ? n_max : _args._Nsize; if (n_end <= n_start) continue; - for (unsigned int m0=0; m0<_Ksize; m0+=m_block) { - unsigned int mmax = std::min(m0 + m_block, _Ksize); + for (unsigned int k0=0; k0<_args._Ksize; k0+=k_block) { + unsigned int kmax = std::min(k0 + k_block, _args._Ksize); for (unsigned int n=n_start; n_Bptr below instead */ - strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()), - (_Ksize * strategy::A_interleave()), - this->_Aptr + (multi * this->_A_multi_stride) + m0, + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0, + _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()), this->_Cptr + (multi * this->_C_multi_stride) + n, - static_cast(0), (mmax-m0), (nmax-n)); - - // Handle activation separately for now - if (this->_bias) { - activator(this->_Cptr + (multi * this->_C_multi_stride) + n, 0, - this->_bias + (multi * this->_bias_multi_stride) + n, - _act, 1, (nmax-n)); - } else { - activator(this->_Cptr + (multi * this->_C_multi_stride) + n, 0, - static_cast(nullptr), - _act, 1, (nmax-n)); - } + (nmax - n), (kmax-k0), + this->_bias ? 
this->_bias + (multi * this->_bias_multi_stride) + n : nullptr, + _args._act, (k0 != 0)); } } } @@ -152,33 +134,27 @@ public: } bool B_pretranspose_required() const override { - /* Transpose is required if _A_pretransposed is still nullptr */ - return (_A_pretransposed == nullptr); + /* Transpose is required if _B_pretransposed is still nullptr */ + return (_B_pretransposed == nullptr); } size_t get_B_pretransposed_array_size() const override { - return _buffer_per_multi * _nmultis * sizeof(To); + return _buffer_per_multi * _args._nmulti * sizeof(To); } void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { - Toi *A_buffer = reinterpret_cast(buffer); - - for (unsigned int multi=0; multi<_nmultis; multi++) { - /* Reverse sense here as we are dealing with B rather than A. So if - * strategy::A_transpose is false and _trB is false, we still - * transpose. */ - if (strategy::A_transpose()) { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); - } else { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); - } + Toi *B_buffer = reinterpret_cast(buffer); + strategy strat(_args._ci); + + for (unsigned int multi=0; multi<_args._nmulti; multi++) { + strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize); } - _A_pretransposed = A_buffer; + _B_pretransposed = B_buffer; } void set_pretransposed_B_data(void *buffer) override { - _A_pretransposed = reinterpret_cast(buffer); + _B_pretransposed = reinterpret_cast(buffer); } }; diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp new file mode 100644 index 0000000000..807511f0d2 --- /dev/null +++ 
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __arm__ + +#include + +#include "../asmlib.hpp" + +template<> +void interleave_block<6, 1, VLType::None, false>( + float * &outptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + const float *inptr0 = in[0] + row_offset; + const float *inptr1 = in[1] + row_offset; + const float *inptr2 = in[2] + row_offset; + const float *inptr3 = in[3] + row_offset; + const float *inptr4 = in[4] + row_offset; + const float *inptr5 = in[5] + row_offset; + + // Cope with ragged cases by aliasing the first row (which is always valid). + // The nonsense output produced will be suppressed later anyway. 
+ switch (height) { + case 1: + inptr1 = inptr0; + // fall through + case 2: + inptr2 = inptr0; + // fall through + case 3: + inptr3 = inptr0; + // fall through + case 4: + inptr4 = inptr0; + // fall through + case 5: + inptr5 = inptr0; + // fall through + default: + case 6: + break; + } + + //prefetch_2x(inptr0); + //prefetch_2x(inptr1); + //prefetch_2x(inptr2); + //prefetch_2x(inptr3); + //prefetch_2x(inptr4); + //prefetch_2x(inptr5); + + for (;width>7;width-=8) { + __asm __volatile ( + // Load up 8 elements (2 vectors) from each of 8 sources. + "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 + "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 + "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 + "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 + "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 + "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 + "VLD1.32 {d16-d19}, [%[inptr4]]!\n" + "VLD1.32 {d20-d23}, [%[inptr5]]!\n" + "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 + ASM_PREFETCH("[%[inptr0], #128]") + "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 + + // Store first elements + "VST1.32 {d0-d1}, [%[outptr]]!\n" + "VST1.32 {d16}, [%[outptr]]!\n" + + "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 + + // Store second elements + "VST1.32 {d4-d5}, [%[outptr]]!\n" + "VZIP.32 q1, q5\n" + ASM_PREFETCH("[%[inptr1], #128]") + "VST1.32 {d17}, [%[outptr]]!\n" + "VZIP.32 q3, q7\n" + + // Store third elements + "VZIP.32 q9, q11\n" + "VST1.32 {d8-d9}, [%[outptr]]!\n" + "VZIP.32 q1, q3\n" + ASM_PREFETCH("[%[inptr2], #128]") + "VST1.32 {d20}, [%[outptr]]!\n" + + // Store fourth elements + "VZIP.32 q5, q7\n" + "VST1.32 {d12-d13}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr3], #128]") + "VST1.32 {d21}, [%[outptr]]!\n" + + // Fifth + "VST1.32 {d2-d3}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr4], #128]") + "VST1.32 {d18}, [%[outptr]]!\n" + + // Sixth + "VST1.32 {d6-d7}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr5], #128]") + "VST1.32 {d19}, [%[outptr]]!\n" + + // Seventh + 
"VST1.32 {d10-d11}, [%[outptr]]!\n" + "VST1.32 {d22}, [%[outptr]]!\n" + + // Eighth + "VST1.32 {d14-d15}, [%[outptr]]!\n" + "VST1.32 {d23}, [%[outptr]]!\n" + + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory" + ); + } + + for (;width>0;width--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + } +} + +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp new file mode 100644 index 0000000000..8054c2b96b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<4, 16, VLType::None, false>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x22, [%x[in], #0x0]\n" + "cmp %x[height], #0x4\n" + "ldr x21, [%x[in], #0x8]\n" + "add x22, x22, %x[row_offset]\n" + "ldr x20, [%x[in], #0x10]\n" + "ldr x19, [%x[in], #0x18]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "add x19, x19, %x[row_offset]\n" + "beq 1f\n" + "mov x19, x22\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x22, GE\n" + "csel x20, x20, x22, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x22, #0x0]\n" + "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q19, [x22], #0x10\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "str q18, [%x[out_ptr], #0x10]\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 12f\n" + "tbz %x[width], #3, 7f\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 
5f\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 11f\n" + "4:" // odd_loads_1_12 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 11f\n" + "5:" // odd_loads_2_8 + "tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 11f\n" + "6:" // odd_loads_1_8 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 11f\n" + "7:" // odd_loads_4_0 + "tbz %x[width], #2, 9f\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 11f\n" + "8:" // odd_loads_1_4 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 11f\n" + "9:" // odd_loads_2_0 + "tbz %x[width], #1, 10f\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], 
#0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 11f\n" + "10:" // odd_loads_1_0 + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "11:" // Odd load end + "str q19, [%x[out_ptr], #0x0]\n" + "str q18, [%x[out_ptr], #0x10]\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "12:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22" + ); +} + +template<> +void interleave_block<4, 16, VLType::None, false>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int8_t * &out_cast = reinterpret_cast(out_ptr); + const int8_t * const * in_cast = reinterpret_cast(in); + + interleave_block<4, 16, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp new file mode 100644 index 0000000000..1650916f9f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +/* + * Specialisation of interleave_block for int8_t data: four rows, 16-byte blocks, with per-row sums. + * + * Row pointers are read from in[0..3] and each is advanced by row_offset bytes. When height is + * not 4, x19 (row 3) is replaced by the row 0 pointer, and x21 (row 1) and x20 (row 2) are kept + * only when height is at least 2 or greater than 2 respectively, otherwise they also become the + * row 0 pointer (presumably so that loads for absent rows stay inside valid memory). + * + * Each main loop pass copies 16 bytes from every row to out_ptr (64 bytes, in row order) and adds + * adjacent byte pairs into the 16-bit accumulators v25-v28 with SADALP. x22 counts passes; once it + * exceeds 0x7e the 16-bit accumulators are folded into the 32-bit accumulators v21-v24 and cleared. + * A remainder of fewer than 16 bytes is loaded piecewise by testing bits 3 to 0 of width, then + * stored and summed as a full 64-byte group. + * + * Finally the accumulators are reduced to four 32-bit sums, one per row, which are stored in the + * 16 bytes after the data. When first is false, the 16 bytes before out_ptr are loaded as sums + * from an earlier call, out_ptr is moved back over them, and they are added to the new sums. + * out_ptr is left pointing past the stored sums; width is an in/out operand and is modified. + */ template<> +void interleave_block<4, 16, VLType::None, true>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v28.8h, #0x0\n" + "ldr x23, [%x[in], #0x0]\n" + "mov x22, #0x0\n" + "movi v27.8h, #0x0\n" + "ldr x21, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" + "movi v26.8h, #0x0\n" + "ldr x20, [%x[in], #0x10]\n" + "add x23, x23, %x[row_offset]\n" + "movi v25.8h, #0x0\n" + "ldr x19, [%x[in], #0x18]\n" + "movi v24.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" + "movi v23.4s, #0x0\n" + "add x20, x20, %x[row_offset]\n" + "movi v22.4s, #0x0\n" + "add x19, x19, %x[row_offset]\n" + "movi v21.4s, #0x0\n" + "beq 1f\n" /* height is 4: keep all four row pointers */ + "mov x19, x23\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x23, GE\n" + "csel x20, x20, x23, GT\n" + "1:" // no_pointer_adj: row pointers are final from here on + "movi v20.4s, #0x0\n" /* v20 holds sums carried over from an earlier call; zero unless reloaded below */ + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "cbnz %w[first], 2f\n" /* first is set: there are no earlier sums to reload */ + "sub %x[out_ptr], %x[out_ptr], #0x10\n" + "ld1 { v20.4s }, [%x[out_ptr]]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head: one pass handles 16 bytes from each row + "cmp x22, #0x7e\n" /* x22 counts passes since the 16-bit accumulators were last folded */ + "ble 4f\n" + "sadalp v24.4s, v28.8h\n" + "movi v28.8h, #0x0\n" + "sadalp v23.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v22.4s, v26.8h\n" + "movi v26.8h, #0x0\n" + "sadalp v21.4s, v25.8h\n" + "movi v25.8h, #0x0\n" + "mov x22, #0x0\n" + "4:" // no_accumulate_16 + "ldr q19, [x23], #0x10\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "sadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q18, [%x[out_ptr], 
#0x10]\n" + "sadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 3b\n" + "5:" // Main loop skip: fewer than 16 bytes per row remain + "cbz %x[width], 14f\n" /* nothing left: go to the final reduction */ + "tbz %x[width], #3, 9f\n" + "ldr d19, [x23], #0x8\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v19.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x21], 
#0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h19, [x23], #0x2\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b19, [x23, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "13:" // Odd load end: store and sum the partially filled vectors + "str q19, [%x[out_ptr], #0x0]\n" + "sadalp v28.8h, v19.16b\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v25.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip: reduce the accumulators to one 32-bit sum per row + "sadalp v24.4s, v28.8h\n" + "sadalp v23.4s, v27.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" + "sadalp v22.4s, v26.8h\n" + "sadalp v21.4s, v25.8h\n" + "addp v23.4s, v22.4s, v21.4s\n" + "addp v24.4s, v24.4s, v23.4s\n" + "add v24.4s, v24.4s, v20.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp new file mode 100644 index 0000000000..af3efb25b2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +/* + * Specialisation of interleave_block for uint8_t data: four rows, 16-byte blocks, with per-row sums. + * + * Row pointers are read from in[0..3] and each is advanced by row_offset bytes. When height is + * not 4, x19 (row 3) is replaced by the row 0 pointer, and x21 (row 1) and x20 (row 2) are kept + * only when height is at least 2 or greater than 2 respectively, otherwise they also become the + * row 0 pointer (presumably so that loads for absent rows stay inside valid memory). + * + * Each main loop pass copies 16 bytes from every row to out_ptr (64 bytes, in row order) and adds + * adjacent byte pairs into the 16-bit accumulators v25-v28 with UADALP. x22 counts passes; once it + * exceeds 0x7e the 16-bit accumulators are folded into the 32-bit accumulators v21-v24 and cleared. + * A remainder of fewer than 16 bytes is loaded piecewise by testing bits 3 to 0 of width, then + * stored and summed as a full 64-byte group. + * + * Finally the accumulators are reduced to four 32-bit sums, one per row, which are stored in the + * 16 bytes after the data. When first is false, the 16 bytes before out_ptr are loaded as sums + * from an earlier call, out_ptr is moved back over them, and they are added to the new sums. + * out_ptr is left pointing past the stored sums; width is an in/out operand and is modified. + */ template<> +void interleave_block<4, 16, VLType::None, true>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v28.8h, #0x0\n" + "ldr x23, [%x[in], #0x0]\n" + "mov x22, #0x0\n" + "movi v27.8h, #0x0\n" + "ldr x21, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" + "movi v26.8h, #0x0\n" + "ldr x20, [%x[in], #0x10]\n" + "add x23, x23, %x[row_offset]\n" + "movi v25.8h, #0x0\n" + "ldr x19, [%x[in], #0x18]\n" + "movi v24.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" + "movi v23.4s, #0x0\n" + "add x20, x20, %x[row_offset]\n" + "movi v22.4s, #0x0\n" + "add x19, x19, %x[row_offset]\n" + "movi v21.4s, #0x0\n" + "beq 1f\n" /* height is 4: keep all four row pointers */ + "mov x19, x23\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x23, GE\n" + "csel x20, x20, x23, GT\n" + "1:" // no_pointer_adj: row pointers are final from here on + "movi v20.4s, #0x0\n" /* v20 holds sums carried over from an earlier call; zero unless reloaded below */ + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "cbnz %w[first], 2f\n" /* first is set: there are no earlier sums to reload */ + "sub %x[out_ptr], %x[out_ptr], #0x10\n" + "ld1 { v20.4s }, [%x[out_ptr]]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head: one pass handles 16 bytes from each row + "cmp x22, #0x7e\n" /* x22 counts passes since the 16-bit accumulators were last folded */ + "ble 4f\n" + "uadalp v24.4s, v28.8h\n" + "movi v28.8h, #0x0\n" + "uadalp v23.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v22.4s, v26.8h\n" + "movi v26.8h, #0x0\n" + "uadalp v21.4s, v25.8h\n" + "movi v25.8h, #0x0\n" + "mov x22, #0x0\n" + "4:" // no_accumulate_16 + "ldr q19, [x23], #0x10\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "uadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q18, [%x[out_ptr], 
#0x10]\n" + "uadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 3b\n" + "5:" // Main loop skip: fewer than 16 bytes per row remain + "cbz %x[width], 14f\n" /* nothing left: go to the final reduction */ + "tbz %x[width], #3, 9f\n" + "ldr d19, [x23], #0x8\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v19.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x21], 
#0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h19, [x23], #0x2\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b19, [x23, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "13:" // Odd load end: store and sum the partially filled vectors + "str q19, [%x[out_ptr], #0x0]\n" + "uadalp v28.8h, v19.16b\n" + "str q18, [%x[out_ptr], #0x10]\n" + "uadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v25.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip: reduce the accumulators to one 32-bit sum per row + "uadalp v24.4s, v28.8h\n" + "uadalp v23.4s, v27.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" + "uadalp v22.4s, v26.8h\n" + "uadalp v21.4s, v25.8h\n" + "addp v23.4s, v22.4s, v21.4s\n" + "addp v24.4s, v24.4s, v23.4s\n" + "add v24.4s, v24.4s, v20.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp new file mode 100644 index 0000000000..34d25f27b8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +/* + * Specialisation of interleave_block for bfloat16 input and float output: eight rows, block size 1. + * + * Row pointers are read from in[0..7] and each is advanced by row_offset elements (the offset is + * shifted left by 1 to scale it to 2-byte elements). When height is not 8, the row 7 pointer and + * every pointer whose row index is not below height is replaced by the row 0 pointer. + * + * v29 is kept at zero. ZIP1 of v29 with the loaded halfwords puts each bfloat16 value in the upper + * 16 bits of a 32-bit lane with zero below it, which is how the value is widened to float. + * Each main loop pass takes 4 elements from every row and stores 0x80 bytes: four 32-byte groups, + * each holding one element from rows 0 to 7 in order. + * A remainder of 1 to 3 elements is loaded piecewise by testing bits 1 and 0 of width; x19 holds + * the number of elements loaded and one 32-byte group is stored for each of them. + * + * out_ptr is advanced past the stored data and width is modified (both are in/out operands). + * The trailing bool parameter is unnamed and unused. + */ template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "movi v29.8h, #0x0\n" /* v29 stays zero: it supplies the low 16 bits of every widened value */ + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" /* height is 8: keep all eight row pointers */ + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj: row pointers are final from here on + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head: one pass handles 4 elements from each row + "ldr d28, [x27], #0x8\n" + "zip1 v28.8h, v29.8h, v28.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d27, [x26], #0x8\n" + "zip1 v27.8h, v29.8h, v27.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d26, 
[x25], #0x8\n" + "zip1 v26.8h, v29.8h, v26.8h\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d25, [x24], #0x8\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "zip1 v25.8h, v29.8h, v25.8h\n" + "ldr d24, [x23], #0x8\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v24.8h, v29.8h, v24.8h\n" + "ldr d23, [x22], #0x8\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip1 v23.8h, v29.8h, v23.8h\n" + "ldr d22, [x21], #0x8\n" + "zip2 v19.4s, v20.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "zip1 v22.8h, v29.8h, v22.8h\n" + "ldr d21, [x20], #0x8\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v21.8h, v29.8h, v21.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "cmp %x[width], #0x4\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x20]\n" + "zip2 v19.4s, v27.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip: fewer than 4 elements per row remain + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" + "ldr s21, [x20], #0x4\n" + "mov x19, #0x2\n" /* x19 records how many elements per row were loaded */ + "tbz %x[width], #0, 5f\n" + "ld1 { v28.h }[2], [x27]\n" + "ld1 { v27.h }[2], [x26]\n" + "ld1 { v26.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x24]\n" + "ld1 { v24.h }[2], 
[x23]\n" + "ld1 { v23.h }[2], [x22]\n" + "ld1 { v22.h }[2], [x21]\n" + "ld1 { v21.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr h28, [x27, #0x0]\n" + "ldr h27, [x26, #0x0]\n" + "ldr h26, [x25, #0x0]\n" + "ldr h25, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h23, [x22, #0x0]\n" + "ldr h22, [x21, #0x0]\n" + "ldr h21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end: store one 32-byte group per loaded element, counting x19 down + "zip1 v28.8h, v29.8h, v28.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v27.8h, v29.8h, v27.8h\n" + "zip1 v26.8h, v29.8h, v26.8h\n" + "zip1 v25.8h, v29.8h, v25.8h\n" + "zip1 v24.8h, v29.8h, v24.8h\n" + "zip1 v23.8h, v29.8h, v23.8h\n" + "zip1 v22.8h, v29.8h, v22.8h\n" + "zip1 v21.8h, v29.8h, v21.8h\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v19.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "zip2 v19.4s, v27.4s, v25.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp new file mode 100644 index 0000000000..d547957129 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +/* + * Specialisation of interleave_block for __fp16 input and __fp16 output: eight rows, block size 1. + * + * Row pointers are read from in[0..7] and each is advanced by row_offset elements (the offset is + * shifted left by 1 to scale it to 2-byte elements). When height is not 8, the row 7 pointer and + * every pointer whose row index is not below height is replaced by the row 0 pointer. + * + * Each main loop pass loads 8 elements from every row and transposes them with ZIP1 and ZIP2, + * storing 0x80 bytes: eight 16-byte groups, each holding one element from rows 0 to 7 in order. + * A remainder of 1 to 7 elements is loaded piecewise by testing bits 2 to 0 of width; x19 holds + * the number of elements loaded and one 16-byte group is stored for each of them. + * + * out_ptr is advanced past the stored data and width is modified (both are in/out operands). + * The trailing bool parameter is unnamed and unused. + */ template<> +void interleave_block<8, 1, VLType::None, false>( + __fp16 * &out_ptr, const __fp16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" /* height is 8: keep all eight row pointers */ + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj: row pointers are final from here on + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head: one pass handles 8 elements from each row + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, 
[x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip: fewer than 8 elements per row remain + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" +
"ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" /* x19 records how many elements per row were loaded */ + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end: store one 16-byte group per loaded element, counting x19 down + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2
v16.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp new file mode 100644 index 0000000000..b45e622a47 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +/* + * Specialisation of interleave_block for __fp16 input and float output: eight rows, block size 1. + * + * Row pointers are read from in[0..7] and each is advanced by row_offset elements (the offset is + * shifted left by 1 to scale it to 2-byte elements). When height is not 8, the row 7 pointer and + * every pointer whose row index is not below height is replaced by the row 0 pointer. + * + * Each main loop pass loads 4 elements from every row, widens them to float with FCVTL and + * transposes them with ZIP1 and ZIP2, storing 0x80 bytes: four 32-byte groups, each holding one + * element from rows 0 to 7 in order. + * A remainder of 1 to 3 elements is loaded piecewise by testing bits 1 and 0 of width; x19 holds + * the number of elements loaded and one 32-byte group is stored for each of them. + * + * out_ptr is advanced past the stored data and width is modified (both are in/out operands). + * The trailing bool parameter is unnamed and unused. + */ template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const __fp16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" /* height is 8: keep all eight row pointers */ + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj: row pointers are final from here on + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head: one pass handles 4 elements from each row + "ldr d29, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d26, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d25, 
[x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "fcvtl v29.4s, v29.4h\n" + "fcvtl v28.4s, v28.4h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "fcvtl v27.4s, v27.4h\n" + "zip1 v20.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "fcvtl v26.4s, v26.4h\n" + "zip2 v18.4s, v29.4s, v27.4s\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v24.4s, v24.4h\n" + "zip1 v19.4s, v28.4s, v26.4s\n" + "fcvtl v23.4s, v23.4h\n" + "zip2 v17.4s, v28.4s, v26.4s\n" + "fcvtl v22.4s, v22.4h\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.4s, v20.4s, v19.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip1 v20.4s, v18.4s, v17.4s\n" + "cmp %x[width], #0x4\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + "zip1 v18.4s, v25.4s, v23.4s\n" + "zip1 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q21, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v25.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v17.4s, v24.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip: fewer than 4 elements per row remain + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" /* x19 records how many elements per row were loaded */ + "tbz %x[width], #0, 5f\n" + "ld1 { v29.h }[2], [x27]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { 
v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr h29, [x27, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end: store one 32-byte group per loaded element, counting x19 down + "fcvtl v29.4s, v29.4h\n" + "fcvtl v28.4s, v28.4h\n" + "fcvtl v27.4s, v27.4h\n" + "zip1 v20.4s, v29.4s, v27.4s\n" + "fcvtl v26.4s, v26.4h\n" + "fcvtl v25.4s, v25.4h\n" + "zip1 v19.4s, v28.4s, v26.4s\n" + "fcvtl v24.4s, v24.4h\n" + "fcvtl v23.4s, v23.4h\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "fcvtl v22.4s, v22.4h\n" + "zip1 v18.4s, v25.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" + "zip1 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v21.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v18.4s, v29.4s, v27.4s\n" + "zip2 v17.4s, v28.4s, v26.4s\n" + "zip1 v20.4s, v18.4s, v17.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v18.4s, v25.4s, v23.4s\n" + "zip2 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
new file mode 100644 index 0000000000..3f38859c1c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + /* Interleave up to 8 rows of fp32 input into fp32 output with no conversion. Row pointers are read from in[0..7] and advanced by row_offset elements. For each column the 8 row values are written contiguously to out_ptr: the main loop handles 4 columns (0x80 bytes) per pass and the tail writes 0x20 bytes for each of the 1-3 leftover columns. out_ptr is advanced past everything written; the unnamed bool argument is not used. NOTE(review): all 8 entries of in are loaded even when height is below 8 - presumably the caller guarantees they are readable; confirm. */ +template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #2\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #2\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #2\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #2\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #2\n" + "add x22, x22, %x[row_offset], LSL #2\n" + "add x21, x21, %x[row_offset], LSL #2\n" + "add x20, x20, %x[row_offset], LSL #2\n" + "beq 1f\n" + /* height is not 8: row 7 and every row whose index is at or above height are redirected to the row 0 pointer, so those lanes repeat row 0 data */ "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q28, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q27, [x26], #0x10\n" + "ldr q26, [x25], #0x10\n" + "zip1 v23.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q22, [x24], #0x10\n" + "zip2 v26.4s, 
v28.4s, v26.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q25, [x23], #0x10\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q24, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "ldr q21, [x20], #0x10\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "cmp %x[width], #0x4\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d21, [x20], #0x8\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v27.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v22.s }[2], [x24]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v19.s }[2], [x21]\n" + "ld1 { v21.s }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr s28, [x27, #0x0]\n" + "ldr s27, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s22, [x24, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr 
s24, [x22, #0x0]\n" + "ldr s19, [x21, #0x0]\n" + "ldr s21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end + "zip1 v23.4s, v28.4s, v26.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp new file mode 100644 index 0000000000..03f552a575 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + /* Interleave up to 8 rows of 16-bit input into 16-bit output with no conversion. Row pointers are read from in[0..7] and advanced by row_offset elements. For each column the 8 row values are written contiguously to out_ptr: the main loop handles 8 columns (0x80 bytes) per pass and the tail writes 0x10 bytes for each of the 1-7 leftover columns. out_ptr is advanced past everything written; the unnamed bool argument is not used. NOTE(review): all 8 entries of in are loaded even when height is below 8 - presumably the caller guarantees they are readable; confirm. */ +template<> +void interleave_block<8, 1, VLType::None, false>( + int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + /* height is not 8: row 7 and every row whose index is at or above height are redirected to the row 0 pointer, so those lanes repeat row 0 data */ "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr 
q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], 
#0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 
8f\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + /* Unsigned 16-bit variant: the interleave only moves 16-bit lanes, so the pointers are reinterpreted as int16_t and the call is forwarded to the signed specialisation. The casts name their target types explicitly; a reinterpret_cast without a template argument does not compile. */ +template<> +void interleave_block<8, 1, VLType::None, false>( + uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int16_t * &out_cast = reinterpret_cast<int16_t * &>(out_ptr); + const int16_t * const * in_cast = reinterpret_cast<const int16_t * const *>(in); + + interleave_block<8, 1, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git 
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp new file mode 100644 index 0000000000..35c7719de7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + /* Interleave up to 8 rows of int16 input into int16 output and keep a running sum per row. Row pointers are read from in[0..7] and advanced by row_offset elements; for each column the 8 row values are written contiguously to out_ptr (8 columns, 0x80 bytes, per main-loop pass and 0x10 bytes per leftover column). Every stored column vector is also added into v1, eight 16-bit lanes with one lane per row; v1 is widened into the 32-bit accumulators v0 (rows 0-3) and v31 (rows 4-7) once more than 14 main-loop passes have accumulated, and again at the end. When first is zero, out_ptr is first moved back by 0x20 bytes and v0 and v31 are reloaded from there, so the earlier sums are continued and their slot is overwritten by new data. On exit the eight 32-bit sums are stored at out_ptr, which is then advanced by 0x20. NOTE(review): the 16-bit lanes of v1 can hold many full-range int16 values between widenings and may wrap - confirm the expected input range. NOTE(review): all 8 entries of in are loaded even when height is below 8 - presumably the caller guarantees they are readable; confirm. */ +template<> +void interleave_block<8, 1, VLType::None, true>( + int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + /* height is not 8: row 7 and every row whose index is at or above height are redirected to the row 0 pointer, so those lanes repeat row 0 data */ "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x8\n" + "blt 
5f\n" + "3:" // Main loop head + "cmp x19, #0xe\n" + "ble 4f\n" + "saddw v0.4s, v0.4s, v1.4h\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "mov x19, #0x0\n" + "movi v1.8h, #0x0\n" + "4:" // no_accumulate_16 + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "add x19, x19, #0x1\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "cmp %x[width], #0x8\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, 
[%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v16.8h\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 10f\n" + "tbz %x[width], #2, 7f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 9f\n" + "6:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 9f\n" + "7:" // odd_loads_2_0 + "tbz %x[width], #1, 8f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], 
[x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 9f\n" + "8:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "9:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v17.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 
v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "10:" // Odds skip + "saddw v0.4s, v0.4s, v1.4h\n" + "str q0, [%x[out_ptr], #0x0]\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "str q31, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp new file mode 100644 index 0000000000..582836fe67 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Interleaves 8 rows of int8 input into column-major groups of 8 int16 values (plain variant: no row sums are produced).
+void interleave_block<8, 1, VLType::None, false>( // NOTE(review): template arguments presumably mean <rows=8, block=1, vector-length type, summing=false>; the primary template is not visible here - confirm.
+  int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: write cursor, advanced in place; in: table of 8 row pointers; width: columns to convert; height: rows that are valid (see pointer fix-up below).
+  size_t row_offset, bool // row_offset: byte offset added to every row pointer; the unnamed bool is not used by this variant.
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; NOTE(review): all 8 slots are read even when height < 8 - confirm the table always has 8 entries.
+      "cmp %x[height], #0x8\n" // flags consumed by "beq 1f" below; the ldr/add in between do not modify flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n" // every row pointer is advanced by row_offset bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 reads row 0's data instead
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise aliased to row 0
+      "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch hints at offsets 0 and 0x40 of every row
+      "cmp %x[width], #0x8\n" // flags consumed by "blt 3f" below; prfm does not modify flags
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n" // fewer than 8 columns: go straight to the tail
+      "2:" // Main loop head: each iteration converts 8 columns of all 8 rows
+      "ldr d30, [x27], #0x8\n" // 8 bytes of row 0, post-incremented; rows 1..7 land in v29, v28, v27, v23, v21, v26, v25
+      "prfm pldl1keep, [x27, #0x70]\n" // prefetch hint ahead of the advanced pointer
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "sshll v30.8h, v30.8b, #0x0\n" // shift by 0: pure sign-extension int8 -> int16
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n" // three zip levels (rows 0/4, 2/6, 1/5, 3/7 -> even/odd rows -> all rows) transpose the 8x8 tile
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "cmp %x[width], #0x8\n" // flags consumed by "bge 2b"; the zip/str/add that follow leave them intact
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // v16 = column 0 of rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 1
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+      "bge 2b\n" // repeat while at least 8 columns remain
+      "3:" // Main loop skip
+      "cbz %x[width], 8f\n" // no leftover columns
+      "tbz %x[width], #2, 5f\n" // leftover is 1..7; decode it bit by bit, x19 ends up holding the leftover column count
+      "ldr s30, [x27], #0x4\n" // bit 2 set: first 4 bytes of each row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n" // bit 1 set: bytes 4-5 of each row
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[6], [x27]\n" // bit 0 set: byte 6 of each row
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:" // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:" // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr h30, [x27], #0x2\n"
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:" // odd_loads_1_0: bits 2 and 1 are clear and width != 0, so exactly one column is left
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:" // Odd load end: same widen + transpose as the main loop, but only x19 columns are stored
+      "sshll v30.8h, v30.8b, #0x0\n"
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n" // each subs/beq pair stops as soon as the last leftover column has been written
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n" // plain add: does not disturb the flags set by subs
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n" // seventh column: the most that can be left over, so no further check
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
new file mode 100644
index 0000000000..35dc3dc0d4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Interleaves 8 rows of int8 input into column-major groups of 8 int16 values and appends eight 32-bit per-row sums after the data.
+void interleave_block<8, 1, VLType::None, true>( // NOTE(review): template arguments presumably mean <rows=8, block=1, vector-length type, summing=true>; the primary template is not visible here - confirm.
+  int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: write cursor, advanced in place; in: table of 8 row pointers; width: columns to convert; height: rows that are valid (see pointer fix-up below).
+  size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the 32 bytes just before out_ptr are reloaded as running sums and overwritten.
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n" // v1: 16-bit partial sums, one lane per row
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; NOTE(review): all 8 slots are read even when height < 8 - confirm the table always has 8 entries.
+      "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last folded into v0/v31
+      "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n" // flags consumed by "beq 1f" below; movi/ldr/add in between do not modify flags
+      "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n" // every row pointer is advanced by row_offset bytes
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 reads row 0's data instead (so its sum lane repeats row 0's sum)
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise aliased to row 0
+      "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch hints at offsets 0 and 0x40 of every row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n" // first != 0: start from zeroed sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back 32 bytes; presumably onto the sums stored at label 10 by the previous call - confirm against caller
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"
+      "ldr q31, [%x[out_ptr], #0x10]\n"
+      "2:" // first_pass
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n" // fewer than 8 columns: go straight to the tail
+      "3:" // Main loop head: each iteration converts 8 columns of all 8 rows
+      "cmp x19, #0xe\n"
+      "ble 4f\n" // fold v1 only once 15 iterations have accumulated; 15*8 (+ up to 7 tail) int8 values per lane stay within int16 range
+      "saddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 into v0
+      "saddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 into v31
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:" // no_accumulate_16
+      "ldr d30, [x27], #0x8\n" // 8 bytes of row 0, post-incremented; rows 1..7 land in v29, v28, v27, v23, v21, v26, v25
+      "prfm pldl1keep, [x27, #0x70]\n" // prefetch hint ahead of the advanced pointer
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "sshll v30.8h, v30.8b, #0x0\n" // shift by 0: pure sign-extension int8 -> int16
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n" // three zip levels (rows 0/4, 2/6, 1/5, 3/7 -> even/odd rows -> all rows) transpose the 8x8 tile
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "add x19, x19, #0x1\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "cmp %x[width], #0x8\n" // flags consumed by "bge 3b"; the vector ops, stores and plain add that follow leave them intact
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // v16 = column 0 of rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n" // every stored column is also added lane-wise into the per-row partial sums
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 1
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n" // repeat while at least 8 columns remain
+      "5:" // Main loop skip
+      "cbz %x[width], 10f\n" // no leftover columns: only the sums remain to be written
+      "tbz %x[width], #2, 7f\n" // leftover is 1..7; decode it bit by bit, x19 is reused to hold the leftover column count
+      "ldr s30, [x27], #0x4\n" // bit 2 set: first 4 bytes of each row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n" // bit 1 set: bytes 4-5 of each row
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[6], [x27]\n" // bit 0 set: byte 6 of each row
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:" // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:" // odd_loads_2_0
+      "tbz %x[width], #1, 8f\n"
+      "ldr h30, [x27], #0x2\n"
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:" // odd_loads_1_0: bits 2 and 1 are clear and width != 0, so exactly one column is left
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:" // Odd load end: same widen + transpose as the main loop, but only x19 columns are stored and summed
+      "sshll v30.8h, v30.8b, #0x0\n"
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n" // each subs/beq pair stops as soon as the last leftover column has been written
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n" // plain add: does not disturb the flags set by subs
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n" // seventh column: the most that can be left over, so no further check
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:" // Odds skip: fold the remaining 16-bit partial sums and append the eight 32-bit row sums
+      "saddw v0.4s, v0.4s, v1.4h\n"
+      "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+      "saddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr ends up just past the 32 bytes of sums
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are read and modified by the asm
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
new file mode 100644
index 0000000000..bfa8989a4d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifdef __aarch64__
+
+template<> // Interleaves 8 rows of uint16 input into column-major groups of 8 uint16 values (no widening) and appends eight 32-bit per-row sums after the data.
+void interleave_block<8, 1, VLType::None, true>( // NOTE(review): template arguments presumably mean <rows=8, block=1, vector-length type, summing=true>; the primary template is not visible here - confirm.
+  uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height, // out_ptr: write cursor, advanced in place; in: table of 8 row pointers; width: columns (elements) to copy; height: rows that are valid (see pointer fix-up below).
+  size_t row_offset, bool first // row_offset: element offset (scaled by 2 bytes below) added to every row pointer; first: when false, the 32 bytes just before out_ptr are reloaded as running sums and overwritten.
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n" // v1: 16-bit partial sums, one lane per row
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; NOTE(review): all 8 slots are read even when height < 8 - confirm the table always has 8 entries.
+      "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last folded into v0/v31
+      "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n" // flags consumed by "beq 1f" below; movi/ldr/add in between do not modify flags
+      "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n" // LSL #1: row_offset counts 2-byte elements
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 reads row 0's data instead (so its sum lane repeats row 0's sum)
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise aliased to row 0
+      "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch hints at offsets 0 and 0x40 of every row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n" // first != 0: start from zeroed sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back 32 bytes; presumably onto the sums stored at label 10 by the previous call - confirm against caller
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"
+      "ldr q31, [%x[out_ptr], #0x10]\n"
+      "2:" // first_pass
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n" // fewer than 8 columns: go straight to the tail
+      "3:" // Main loop head: each iteration copies 8 columns of all 8 rows
+      "cmp x19, #0xe\n"
+      "ble 4f\n" // fold v1 only once 15 iterations have accumulated; NOTE(review): inputs here are already 16-bit, so the .8h partial sums can wrap modulo 2^16 before being widened unless values are known to be small - confirm intended input range.
+      "uaddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 into v0
+      "uaddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 into v31
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:" // no_accumulate_16
+      "ldr q30, [x27], #0x10\n" // 8 elements of row 0, post-incremented; rows 1..7 land in v29, v28, v27, v24, v25, v23, v22
+      "prfm pldl1keep, [x27, #0x70]\n" // prefetch hint ahead of the advanced pointer
+      "ldr q29, [x26], #0x10\n"
+      "ldr q28, [x25], #0x10\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q27, [x24], #0x10\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q24, [x23], #0x10\n"
+      "zip1 v26.8h, v30.8h, v24.8h\n" // three zip levels (rows 0/4, 2/6, 1/5, 3/7 -> even/odd rows -> all rows) transpose the 8x8 tile
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q23, [x21], #0x10\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "add x19, x19, #0x1\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "cmp %x[width], #0x8\n" // flags consumed by "bge 3b"; the vector ops, stores and plain add that follow leave them intact
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n" // v16 = column 0 of rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n" // every stored column is also added lane-wise into the per-row partial sums
+      "zip2 v17.8h, v20.8h, v17.8h\n" // column 1
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n" // column 2
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "zip2 v16.8h, v18.8h, v16.8h\n" // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n" // repeat while at least 8 columns remain
+      "5:" // Main loop skip
+      "cbz %x[width], 10f\n" // no leftover columns: only the sums remain to be written
+      "tbz %x[width], #2, 7f\n" // leftover is 1..7; decode it bit by bit, x19 is reused to hold the leftover column count
+      "ldr d30, [x27], #0x8\n" // bit 2 set: first 4 elements of each row
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d27, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v30.s }[2], [x27], #0x4\n" // bit 1 set: elements 4-5 of each row
+      "ld1 { v29.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "mov x19, #0x6\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[6], [x27]\n" // bit 0 set: element 6 of each row
+      "ld1 { v29.h }[6], [x26]\n"
+      "ld1 { v28.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x24]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v25.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:" // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[4], [x27]\n"
+      "ld1 { v29.h }[4], [x26]\n"
+      "ld1 { v28.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x24]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v25.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:" // odd_loads_2_0
+      "tbz %x[width], #1, 8f\n"
+      "ldr s30, [x27], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[2], [x27]\n"
+      "ld1 { v29.h }[2], [x26]\n"
+      "ld1 { v28.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v25.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:" // odd_loads_1_0: bits 2 and 1 are clear and width != 0, so exactly one column is left
+      "ldr h30, [x27, #0x0]\n"
+      "ldr h29, [x26, #0x0]\n"
+      "ldr h28, [x25, #0x0]\n"
+      "ldr h27, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h25, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:" // Odd load end: same transpose as the main loop, but only x19 columns are stored and summed
+      "zip1 v26.8h, v30.8h, v24.8h\n"
+      "subs x19, x19, #0x1\n" // each subs/beq pair stops as soon as the last leftover column has been written
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n" // plain add: does not disturb the flags set by subs
+      "beq 10f\n"
+      "zip2 v17.8h, v20.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v16.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n" // seventh column: the most that can be left over, so no further check
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:" // Odds skip: fold the remaining 16-bit partial sums and append the eight 32-bit row sums
+      "uaddw v0.4s, v0.4s, v1.4h\n"
+      "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr ends up just past the 32 bytes of sums
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are read and modified by the asm
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
new file mode 100644
index 0000000000..86b90f1898
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Interleaves 8 rows of uint8 input into column-major groups of 8 uint16 values (plain variant: no row sums are produced).
+void interleave_block<8, 1, VLType::None, false>( // NOTE(review): template arguments presumably mean <rows=8, block=1, vector-length type, summing=false>; the primary template is not visible here - confirm.
+  uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: write cursor, advanced in place; in: table of 8 row pointers; width: columns to convert; height: rows that are valid (see pointer fix-up below).
+  size_t row_offset, bool // row_offset: byte offset added to every row pointer; the unnamed bool is not used by this variant.
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; NOTE(review): all 8 slots are read even when height < 8 - confirm the table always has 8 entries.
+      "cmp %x[height], #0x8\n" // flags consumed by "beq 1f" below; the ldr/add in between do not modify flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n" // every row pointer is advanced by row_offset bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 reads row 0's data instead
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise aliased to row 0
+      "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch hints at offsets 0 and 0x40 of every row
+      "cmp %x[width], #0x8\n" // flags consumed by "blt 3f" below; prfm does not modify flags
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n" // fewer than 8 columns: go straight to the tail
+      "2:" // Main loop head: each iteration converts 8 columns of all 8 rows
+      "ldr d30, [x27], #0x8\n" // 8 bytes of row 0, post-incremented; rows 1..7 land in v29, v28, v27, v23, v21, v26, v25
+      "prfm pldl1keep, [x27, #0x70]\n" // prefetch hint ahead of the advanced pointer
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ushll v30.8h, v30.8b, #0x0\n" // shift by 0: pure zero-extension uint8 -> uint16
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n" // three zip levels (rows 0/4, 2/6, 1/5, 3/7 -> even/odd rows -> all rows) transpose the 8x8 tile
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "cmp %x[width], #0x8\n" // flags consumed by "bge 2b"; the zip/str/add that follow leave them intact
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // v16 = column 0 of rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 1
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+      "bge 2b\n" // repeat while at least 8 columns remain
+      "3:" // Main loop skip
+      "cbz %x[width], 8f\n" // no leftover columns
+      "tbz %x[width], #2, 5f\n" // leftover is 1..7; decode it bit by bit, x19 ends up holding the leftover column count
+      "ldr s30, [x27], #0x4\n" // bit 2 set: first 4 bytes of each row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n" // bit 1 set: bytes 4-5 of each row
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[6], [x27]\n" // bit 0 set: byte 6 of each row
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:" // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:" // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr h30, [x27], #0x2\n"
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:" // odd_loads_1_0: bits 2 and 1 are clear and width != 0, so exactly one column is left
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:" // Odd load end: same widen + transpose as the main loop, but only x19 columns are stored
+      "ushll v30.8h, v30.8b, #0x0\n"
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n" // each subs/beq pair stops as soon as the last leftover column has been written
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n" // plain add: does not disturb the flags set by subs
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n" // seventh column: the most that can be left over, so no further check
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
new file mode 100644
index 0000000000..cefb70c57b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+// Interleave up to 8 rows of uint8_t into uint16_t output, one element per row
+// at a time (interleave height 8, block size 1), and maintain per-row sums.
+//
+//   out_ptr    - output cursor, updated in place. Every input column produces
+//                one 16-byte group: that column's value from rows 0..7, each
+//                zero-extended to 16 bits. After the data, eight 32-bit row
+//                sums (32 bytes) are stored and out_ptr is advanced past them.
+//   in         - array of row pointers; entries 0..7 are loaded unconditionally.
+//   width      - number of columns to process.
+//   height     - number of valid rows. When it is not 8, rows with index >=
+//                height are read from row 0's pointer instead.
+//                NOTE(review): a value above 8 would still redirect row 7 to
+//                row 0 - presumably callers never pass more than 8; confirm.
+//   row_offset - offset added to every row pointer (elements are one byte).
+//   first      - when false, the 32 bytes immediately before out_ptr are
+//                reloaded as the starting sums and out_ptr is moved back over
+//                them, so this call's output overwrites that old sum record.
+template<>
+void interleave_block<8, 1, VLType::None, true>(
+  uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool first
+)
+{
+  __asm__ __volatile__(
+      // v1 = 16-bit partial row sums, v0/v31 = 32-bit sums for rows 0-3 / 4-7,
+      // x19 = main-loop iterations since v1 was last folded into v0/v31.
+      // Row pointers: x27 = row 0, x26 = row 1, ... x20 = row 7.
+      "movi v1.8h, #0x0\n"
+      "ldr x27, [%x[in], #0x0]\n"
+      "mov x19, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"
+      "movi v31.4s, #0x0\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      // height == 8: keep all pointers. Otherwise replace the pointers of the
+      // missing rows with row 0's pointer so every load stays on a valid row.
+      "beq 1f\n"
+      "mov x20, x27\n"
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      // Not the first pass: step back over the previously stored sums and
+      // reload them into v0/v31.
+      "cbnz %w[first], 2f\n"
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"
+      "ldr q31, [%x[out_ptr], #0x10]\n"
+      "2:" // first_pass
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n"
+      "3:" // Main loop head
+      // Once more than 14 iterations have accumulated, widen v1 into v0/v31
+      // and clear it (presumably to keep the 16-bit lanes from overflowing).
+      "cmp x19, #0xe\n"
+      "ble 4f\n"
+      "uaddw v0.4s, v0.4s, v1.4h\n"
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:" // no_accumulate_16
+      // Load 8 bytes per row, widen to 16 bits, then transpose with a zip
+      // network so each stored vector holds one column across rows 0..7.
+      "ldr d30, [x27], #0x8\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ushll v30.8h, v30.8b, #0x0\n"
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "add x19, x19, #0x1\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "cmp %x[width], #0x8\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n"
+      "5:" // Main loop skip
+      // Tail: 1..7 columns remain. Bits 2/1/0 of width pick 4-, 2- and 1-byte
+      // loads; x19 is reused as the number of columns still to be stored.
+      "cbz %x[width], 10f\n"
+      "tbz %x[width], #2, 7f\n"
+      "ldr s30, [x27], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:" // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:" // odd_loads_2_0
+      "tbz %x[width], #1, 8f\n"
+      "ldr h30, [x27], #0x2\n"
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:" // odd_loads_1_0
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:" // Odd load end
+      // Same widen + transpose as the main loop, but one 16-byte column group
+      // is stored at a time and the sequence exits as soon as x19 reaches 0.
+      "ushll v30.8h, v30.8b, #0x0\n"
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:" // Odds skip
+      // Fold the remaining 16-bit partial sums into the 32-bit totals and
+      // append the eight totals (32 bytes) to the output.
+      "uaddw v0.4s, v0.4s, v1.4h\n"
+      "str q0, [%x[out_ptr], #0x0]\n"
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..5377edc1e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifdef __aarch64__
+
+// Interleave up to 8 rows of bfloat16 in blocks of 2 consecutive elements
+// (interleave height 8, block size 2, no summing).
+//
+//   out_ptr    - output cursor, updated in place. Each block of 2 columns
+//                produces 32 bytes: the 2-element block of row 0, then row 1,
+//                ... then row 7.
+//   in         - array of row pointers; entries 0..7 are loaded unconditionally.
+//   width      - number of columns (elements per row) to process.
+//   height     - number of valid rows. When it is not 8, rows with index >=
+//                height are read from row 0's pointer instead.
+//                NOTE(review): a value above 8 would still redirect row 7 to
+//                row 0 - presumably callers never pass more than 8; confirm.
+//   row_offset - element offset added to every row pointer (scaled by 2 bytes).
+//   (bool)     - unnamed and unused in this variant.
+template<>
+void interleave_block<8, 2, VLType::None, false>(
+  bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  __asm__ __volatile__(
+      // Row pointers: x27 = row 0, x26 = row 1, ... x20 = row 7.
+      "ldr x27, [%x[in], #0x0]\n"
+      "cmp %x[height], #0x8\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      // height == 8: keep all pointers. Otherwise replace the pointers of the
+      // missing rows with row 0's pointer so every load stays on a valid row.
+      "beq 1f\n"
+      "mov x20, x27\n"
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x8\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"
+      "2:" // Main loop head
+      // 8 elements (4 blocks) per row per iteration. The zips work on 32-bit
+      // lanes, so each 2-element block moves as one unit; 0x80 bytes are
+      // written per iteration.
+      "ldr q28, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q27, [x26], #0x10\n"
+      "ldr q26, [x25], #0x10\n"
+      "zip1 v23.4s, v28.4s, v26.4s\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q22, [x24], #0x10\n"
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q25, [x23], #0x10\n"
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q24, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "ldr q21, [x20], #0x10\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "cmp %x[width], #0x8\n"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "str q23, [%x[out_ptr], #0x20]\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q20, [%x[out_ptr], #0x40]\n"
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"
+      "3:" // Main loop skip
+      // Tail: 1..7 columns remain. Bits 2/1/0 of width pick 8-, 4- and 2-byte
+      // loads; x19 = number of 2-element blocks to store (width/2 rounded up).
+      // Lanes never loaded stay zero because the first scalar ldr clears the
+      // rest of the vector register.
+      "cbz %x[width], 8f\n"
+      "tbz %x[width], #2, 5f\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d22, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d21, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v28.s }[2], [x27], #0x4\n"
+      "ld1 { v27.s }[2], [x26], #0x4\n"
+      "ld1 { v26.s }[2], [x25], #0x4\n"
+      "ld1 { v22.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v24.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v21.s }[2], [x20], #0x4\n"
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[6], [x27]\n"
+      "ld1 { v27.h }[6], [x26]\n"
+      "ld1 { v26.h }[6], [x25]\n"
+      "ld1 { v22.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "ld1 { v24.h }[6], [x22]\n"
+      "ld1 { v19.h }[6], [x21]\n"
+      "ld1 { v21.h }[6], [x20]\n"
+      "mov x19, #0x4\n"
+      "b 7f\n"
+      "4:" // odd_loads_1_4
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[4], [x27]\n"
+      "ld1 { v27.h }[4], [x26]\n"
+      "ld1 { v26.h }[4], [x25]\n"
+      "ld1 { v22.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "ld1 { v24.h }[4], [x22]\n"
+      "ld1 { v19.h }[4], [x21]\n"
+      "ld1 { v21.h }[4], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "5:" // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr s28, [x27], #0x4\n"
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s21, [x20], #0x4\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[2], [x27]\n"
+      "ld1 { v27.h }[2], [x26]\n"
+      "ld1 { v26.h }[2], [x25]\n"
+      "ld1 { v22.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v24.h }[2], [x22]\n"
+      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v21.h }[2], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 7f\n"
+      "6:" // odd_loads_1_0
+      "ldr h28, [x27, #0x0]\n"
+      "ldr h27, [x26, #0x0]\n"
+      "ldr h26, [x25, #0x0]\n"
+      "ldr h22, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h24, [x22, #0x0]\n"
+      "ldr h19, [x21, #0x0]\n"
+      "ldr h21, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:" // Odd load end
+      // Store one 32-byte block group at a time, leaving once x19 reaches 0.
+      "zip1 v23.4s, v28.4s, v26.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q23, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "str q20, [%x[out_ptr], #0x0]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "8:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
new file mode 100644
index 0000000000..3aea6a8999
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+// Interleave up to 8 rows of float in blocks of 2 consecutive elements
+// (interleave height 8, block size 2, no summing).
+//
+//   out_ptr    - output cursor, updated in place. Each block of 2 columns
+//                produces 64 bytes: the 2-element block of row 0, then row 1,
+//                ... then row 7.
+//   in         - array of row pointers; entries 0..7 are loaded unconditionally.
+//   width      - number of columns (elements per row) to process.
+//   height     - number of valid rows. When it is not 8, rows with index >=
+//                height are read from row 0's pointer instead.
+//                NOTE(review): a value above 8 would still redirect row 7 to
+//                row 0 - presumably callers never pass more than 8; confirm.
+//   row_offset - element offset added to every row pointer (scaled by 4 bytes).
+//   (bool)     - unnamed and unused in this variant.
+template<>
+void interleave_block<8, 2, VLType::None, false>(
+  float * &out_ptr, const float * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  __asm__ __volatile__(
+      // Row pointers: x27 = row 0, x26 = row 1, ... x20 = row 7.
+      "ldr x27, [%x[in], #0x0]\n"
+      "cmp %x[height], #0x8\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #2\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #2\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #2\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #2\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #2\n"
+      "add x22, x22, %x[row_offset], LSL #2\n"
+      "add x21, x21, %x[row_offset], LSL #2\n"
+      "add x20, x20, %x[row_offset], LSL #2\n"
+      // height == 8: keep all pointers. Otherwise replace the pointers of the
+      // missing rows with row 0's pointer so every load stays on a valid row.
+      "beq 1f\n"
+      "mov x20, x27\n"
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x4\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"
+      "2:" // Main loop head
+      // 4 elements (2 blocks) per row per iteration. The zips work on 64-bit
+      // lanes, so each 2-float block moves as one unit: zip1 results form
+      // block 0 (offsets 0x0-0x30), zip2 results block 1 (0x40-0x70).
+      "ldr q27, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "str q24, [%x[out_ptr], #0x40]\n"
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x4\n"
+      "cmp %x[width], #0x4\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"
+      "3:" // Main loop skip
+      // Tail: 1..3 columns remain. Bits 1/0 of width pick 8- and 4-byte
+      // loads; x19 = number of 2-element blocks to store (width/2 rounded up).
+      // Lanes never loaded stay zero because the first scalar ldr clears the
+      // rest of the vector register.
+      "cbz %x[width], 6f\n"
+      "tbz %x[width], #1, 4f\n"
+      "ldr d27, [x27], #0x8\n"
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 5f\n"
+      "ld1 { v27.s }[2], [x27]\n"
+      "ld1 { v24.s }[2], [x26]\n"
+      "ld1 { v25.s }[2], [x25]\n"
+      "ld1 { v21.s }[2], [x24]\n"
+      "ld1 { v22.s }[2], [x23]\n"
+      "ld1 { v18.s }[2], [x22]\n"
+      "ld1 { v19.s }[2], [x21]\n"
+      "ld1 { v16.s }[2], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 5f\n"
+      "4:" // odd_loads_1_0
+      "ldr s27, [x27, #0x0]\n"
+      "ldr s24, [x26, #0x0]\n"
+      "ldr s25, [x25, #0x0]\n"
+      "ldr s21, [x24, #0x0]\n"
+      "ldr s22, [x23, #0x0]\n"
+      "ldr s18, [x22, #0x0]\n"
+      "ldr s19, [x21, #0x0]\n"
+      "ldr s16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "5:" // Odd load end
+      // Store one 64-byte block group, then a second one only if x19 was 2.
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 6f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "6:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
new file mode 100644
index 0000000000..4780b77a4a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifdef __aarch64__
+
+// Interleave up to 8 rows of bfloat16 in blocks of 4 consecutive elements
+// (interleave height 8, block size 4, no summing).
+//
+//   out_ptr    - output cursor, updated in place. Each block of 4 columns
+//                produces 64 bytes: the 4-element block of row 0, then row 1,
+//                ... then row 7.
+//   in         - array of row pointers; entries 0..7 are loaded unconditionally.
+//   width      - number of columns (elements per row) to process.
+//   height     - number of valid rows. When it is not 8, rows with index >=
+//                height are read from row 0's pointer instead.
+//                NOTE(review): a value above 8 would still redirect row 7 to
+//                row 0 - presumably callers never pass more than 8; confirm.
+//   row_offset - element offset added to every row pointer (scaled by 2 bytes).
+//   (bool)     - unnamed and unused in this variant.
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+  bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  __asm__ __volatile__(
+      // Row pointers: x27 = row 0, x26 = row 1, ... x20 = row 7.
+      "ldr x27, [%x[in], #0x0]\n"
+      "cmp %x[height], #0x8\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      // height == 8: keep all pointers. Otherwise replace the pointers of the
+      // missing rows with row 0's pointer so every load stays on a valid row.
+      "beq 1f\n"
+      "mov x20, x27\n"
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x8\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"
+      "2:" // Main loop head
+      // 8 elements (2 blocks) per row per iteration. The zips work on 64-bit
+      // lanes, so each 4-element block moves as one unit: zip1 results form
+      // block 0 (offsets 0x0-0x30), zip2 results block 1 (0x40-0x70).
+      "ldr q27, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "str q24, [%x[out_ptr], #0x40]\n"
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "cmp %x[width], #0x8\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"
+      "3:" // Main loop skip
+      // Tail: 1..7 columns remain. Bits 2/1/0 of width pick 8-, 4- and 2-byte
+      // loads; x19 = number of 4-element blocks to store (width/4 rounded up).
+      // Lanes never loaded stay zero because the first scalar ldr clears the
+      // rest of the vector register.
+      "cbz %x[width], 8f\n"
+      "tbz %x[width], #2, 5f\n"
+      "ldr d27, [x27], #0x8\n"
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[6], [x27]\n"
+      "ld1 { v24.h }[6], [x26]\n"
+      "ld1 { v25.h }[6], [x25]\n"
+      "ld1 { v21.h }[6], [x24]\n"
+      "ld1 { v22.h }[6], [x23]\n"
+      "ld1 { v18.h }[6], [x22]\n"
+      "ld1 { v19.h }[6], [x21]\n"
+      "ld1 { v16.h }[6], [x20]\n"
+      "b 7f\n"
+      "4:" // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[4], [x27]\n"
+      "ld1 { v24.h }[4], [x26]\n"
+      "ld1 { v25.h }[4], [x25]\n"
+      "ld1 { v21.h }[4], [x24]\n"
+      "ld1 { v22.h }[4], [x23]\n"
+      "ld1 { v18.h }[4], [x22]\n"
+      "ld1 { v19.h }[4], [x21]\n"
+      "ld1 { v16.h }[4], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 7f\n"
+      "5:" // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[2], [x27]\n"
+      "ld1 { v24.h }[2], [x26]\n"
+      "ld1 { v25.h }[2], [x25]\n"
+      "ld1 { v21.h }[2], [x24]\n"
+      "ld1 { v22.h }[2], [x23]\n"
+      "ld1 { v18.h }[2], [x22]\n"
+      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v16.h }[2], [x20]\n"
+      "b 7f\n"
+      "6:" // odd_loads_1_0
+      "ldr h27, [x27, #0x0]\n"
+      "ldr h24, [x26, #0x0]\n"
+      "ldr h25, [x25, #0x0]\n"
+      "ldr h21, [x24, #0x0]\n"
+      "ldr h22, [x23, #0x0]\n"
+      "ldr h18, [x22, #0x0]\n"
+      "ldr h19, [x21, #0x0]\n"
+      "ldr h16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:" // Odd load end
+      // Store one 64-byte block group, then a second one only if x19 was 2.
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 8f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "8:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
new file mode 100644
index 0000000000..a9034f5742
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#ifdef __aarch64__ + +template<> /* int8_t specialisation: interleaves 8 input rows into out_ptr in 4-byte groups; out_ptr is advanced past the bytes written. */ +void interleave_block<8, 4, VLType::None, false>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" /* height == 8: keep all eight row pointers */ + "mov x20, x27\n" /* otherwise row 7 and every row with index >= height read from row 0's pointer (x27) */ + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" /* fewer than 16 bytes left per row: skip the main loop and handle the tail */ + "2:" // Main loop head + "ldr q28, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q27, [x26], #0x10\n" + "ldr q26, [x25], #0x10\n" + "zip1 v23.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q22, [x24], #0x10\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr
q25, [x23], #0x10\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q24, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "ldr q21, [x20], #0x10\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "subs %x[width], %x[width], #0x10\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "cmp %x[width], #0x10\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 12f\n" + "tbz %x[width], #3, 7f\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d21, [x20], #0x8\n" + "tbz %x[width], #2, 5f\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v27.s }[2], [x26], #0x4\n" + "ld1 { v26.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v21.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v28.h }[6], [x27], #0x2\n" + "ld1 { v27.h }[6], [x26], #0x2\n" + "ld1 { v26.h }[6], [x25], #0x2\n" + "ld1 { v22.h }[6], [x24], #0x2\n" + "ld1 { v25.h }[6], [x23], #0x2\n" + "ld1 { v24.h
}[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v21.h }[6], [x20], #0x2\n" + "mov x19, #0x4\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v27.b }[14], [x26]\n" + "ld1 { v26.b }[14], [x25]\n" + "ld1 { v22.b }[14], [x24]\n" + "ld1 { v25.b }[14], [x23]\n" + "ld1 { v24.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v21.b }[14], [x20]\n" + "b 11f\n" + "4:" // odd_loads_1_12 + "mov x19, #0x3\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[12], [x27]\n" + "ld1 { v27.b }[12], [x26]\n" + "ld1 { v26.b }[12], [x25]\n" + "ld1 { v22.b }[12], [x24]\n" + "ld1 { v25.b }[12], [x23]\n" + "ld1 { v24.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v21.b }[12], [x20]\n" + "mov x19, #0x4\n" + "b 11f\n" + "5:" // odd_loads_2_8 + "tbz %x[width], #1, 6f\n" + "ld1 { v28.h }[4], [x27], #0x2\n" + "ld1 { v27.h }[4], [x26], #0x2\n" + "ld1 { v26.h }[4], [x25], #0x2\n" + "ld1 { v22.h }[4], [x24], #0x2\n" + "ld1 { v25.h }[4], [x23], #0x2\n" + "ld1 { v24.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v21.h }[4], [x20], #0x2\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v27.b }[10], [x26]\n" + "ld1 { v26.b }[10], [x25]\n" + "ld1 { v22.b }[10], [x24]\n" + "ld1 { v25.b }[10], [x23]\n" + "ld1 { v24.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v21.b }[10], [x20]\n" + "b 11f\n" + "6:" // odd_loads_1_8 + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[8], [x27]\n" + "ld1 { v27.b }[8], [x26]\n" + "ld1 { v26.b }[8], [x25]\n" + "ld1 { v22.b }[8], [x24]\n" + "ld1 { v25.b }[8], [x23]\n" + "ld1 { v24.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v21.b }[8], [x20]\n" + "mov x19, #0x3\n" + "b 11f\n" + "7:" // odd_loads_4_0 + "tbz %x[width], #2, 9f\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr
s21, [x20], #0x4\n" + "tbz %x[width], #1, 8f\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "ld1 { v26.h }[2], [x25], #0x2\n" + "ld1 { v22.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v21.h }[2], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v27.b }[6], [x26]\n" + "ld1 { v26.b }[6], [x25]\n" + "ld1 { v22.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v21.b }[6], [x20]\n" + "b 11f\n" + "8:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[4], [x27]\n" + "ld1 { v27.b }[4], [x26]\n" + "ld1 { v26.b }[4], [x25]\n" + "ld1 { v22.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v21.b }[4], [x20]\n" + "mov x19, #0x2\n" + "b 11f\n" + "9:" // odd_loads_2_0 + "tbz %x[width], #1, 10f\n" + "ldr h28, [x27], #0x2\n" + "ldr h27, [x26], #0x2\n" + "ldr h26, [x25], #0x2\n" + "ldr h22, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h21, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v27.b }[2], [x26]\n" + "ld1 { v26.b }[2], [x25]\n" + "ld1 { v22.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v21.b }[2], [x20]\n" + "b 11f\n" + "10:" // odd_loads_1_0 + "ldr b28, [x27, #0x0]\n" + "ldr b27, [x26, #0x0]\n" + "ldr b26, [x25, #0x0]\n" + "ldr b22, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "11:" // Odd load end + "zip1 v23.4s, v28.4s, v26.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" /* x19 holds the number of 32-byte groups the tail still has to store; stop once it reaches zero */ + "zip2 v23.4s, v23.4s, v20.4s\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "12:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +template<> /* uint8_t overload: the non-summing interleave only rearranges bytes, so it forwards to the int8_t specialisation above. */ +void interleave_block<8, 4, VLType::None, false>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int8_t * &out_cast = reinterpret_cast<int8_t * &>(out_ptr); /* alias the caller's pointer so the advance made by the callee is visible through out_ptr */ + const int8_t * const * in_cast = reinterpret_cast<const int8_t * const *>(in); + + interleave_block<8, 4, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp new file mode 100644 index 0000000000..2831cb79a6 --- /dev/null +++
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> /* int8_t specialisation with summing: interleaves 8 input rows in 4-byte groups and keeps eight 32-bit running sums (v31: rows 0-3, v30: rows 4-7) that are written right after the interleaved data. */ +void interleave_block<8, 4, VLType::None, true>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" /* height == 8: keep all eight row pointers */ + "mov x20, x27\n" /* otherwise row 7 and every row with index >= height read from row 0's pointer (x27) */ + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" /* first != 0: start from the zeroed sums */ + "sub %x[out_ptr], %x[out_ptr], #0x20\n" /* otherwise step out_ptr back 32 bytes and reload the sums found there (presumably left by the previous call - confirm with caller) */ + "ld1 { v31.4s }, [%x[out_ptr]]\n" + "ldr q30, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" /* main loop: 16 bytes per row per iteration; x19 counts iterations since the 16-bit partial sums (v1, v0) were last folded into v31/v30, and a fold runs once x19 exceeds 0x1e (presumably to keep the 16-bit lanes from overflowing) */ + "3:" // Main loop head + "cmp x19,
#0x1e\n" + "ble 4f\n" + "sadalp v31.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v30.4s, v0.8h\n" + "movi v0.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16 + "ldr q29, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q28, [x26], #0x10\n" + "ldr q27, [x25], #0x10\n" + "zip1 v23.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q26, [x23], #0x10\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "sadalp v1.8h, v16.16b\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "add x19, x19, #0x1\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "sadalp v0.8h, v16.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "str q24, [%x[out_ptr], #0x20]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "sadalp v1.8h, v24.16b\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "sadalp v0.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x60]\n" + "sadalp v1.8h, v21.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x10\n" + "sadalp v0.8h, v18.16b\n" + "cmp %x[width], #0x10\n" + "sadalp v1.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "sadalp v0.8h, v16.16b\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26],
#0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v29.s }[2], [x27], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v29.h }[6], [x27], #0x2\n" + "ld1 { v28.h }[6], [x26], #0x2\n" + "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v26.h }[6], [x23], #0x2\n" + "ld1 { v25.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v22.h }[6], [x20], #0x2\n" + "mov x19, #0x4\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[14], [x27]\n" + "ld1 { v28.b }[14], [x26]\n" + "ld1 { v27.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v26.b }[14], [x23]\n" + "ld1 { v25.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v22.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[12], [x27]\n" + "ld1 { v28.b }[12], [x26]\n" + "ld1 { v27.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v26.b }[12], [x23]\n" + "ld1 { v25.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v22.b }[12], [x20]\n" + "mov x19, #0x4\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v29.h }[4], [x27], #0x2\n" + "ld1 { v28.h }[4], [x26], #0x2\n" + "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v26.h }[4], [x23], #0x2\n" + "ld1 { v25.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v22.h }[4], [x20], #0x2\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[10], [x27]\n" + "ld1 { v28.b }[10], [x26]\n" + "ld1 { v27.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v26.b }[10],
[x23]\n" + "ld1 { v25.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v22.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[8], [x27]\n" + "ld1 { v28.b }[8], [x26]\n" + "ld1 { v27.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v26.b }[8], [x23]\n" + "ld1 { v25.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v22.b }[8], [x20]\n" + "mov x19, #0x3\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v29.h }[2], [x27], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v22.h }[2], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[6], [x27]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v22.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[4], [x27]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v22.b }[4], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h29, [x27], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h25, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h22, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width],
#0, 13f\n" + "ld1 { v29.b }[2], [x27]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v22.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b29, [x27, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b25, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end + "zip1 v23.4s, v29.4s, v27.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "sadalp v1.8h, v16.16b\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "sadalp v1.8h, v24.16b\n" + "str q20, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v20.16b\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "sadalp v1.8h, v21.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v18.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "sadalp v1.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "14:" // Odds skip + "sadalp v31.4s, v1.8h\n" + "sadalp v30.4s, v0.8h\n" + "str q31, [%x[out_ptr], #0x0]\n" +
"str q30, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" /* the two 32-bit sum vectors (32 bytes) are stored after the interleaved data and out_ptr moves past them */ + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp new file mode 100644 index 0000000000..7c7857bcd0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+// uint8_t kernel: interleaves 8 input rows in 4-byte groups into out_ptr and appends eight 32-bit per-row byte sums (carried over from the previous call when 'first' is false).
+template<>
+void interleave_block<8, 4, VLType::None, true>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: advanced past everything written; in: table of 8 row pointers; width: bytes consumed per row; height: rows at index >= height are read from row 0 instead
+  size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the 32 bytes of sums just before out_ptr are reloaded and overwritten
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n" // v1/v0: 16-bit partial sums for rows 0-3 / rows 4-7
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are loaded unconditionally
+      "mov x19, #0x0\n" // x19: main-loop iterations since the partial sums were last folded
+      "movi v0.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"
+      "movi v31.4s, #0x0\n" // v31/v30: 32-bit running sums for rows 0-3 / rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "movi v30.4s, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight row pointers
+      "mov x20, x27\n" // any other height: row 7 reads row 0 (assumes height <= 8 -- TODO confirm with callers)
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // each remaining row pointer is kept only if its index < height, else replaced by row 0
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n" // first call: sums start from zero
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n" // continuation: step back over the 32 bytes of sums stored by the previous call
+      "ld1 { v31.4s }, [%x[out_ptr]]\n" // reload them; the interleaved data written below overwrites that location
+      "ldr q30, [%x[out_ptr], #0x10]\n"
+      "2:" // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"
+      "3:" // Main loop head
+      "cmp x19, #0x1e\n" // once more than 30 iterations have accumulated, fold the 16-bit partials into v31/v30 (presumably to stay clear of 16-bit overflow)
+      "ble 4f\n"
+      "uadalp v31.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v30.4s, v0.8h\n"
+      "movi v0.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:" // no_accumulate_16
+      "ldr q29, [x27], #0x10\n" // 16 bytes from each of the 8 rows per iteration
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q28, [x26], #0x10\n"
+      "ldr q27, [x25], #0x10\n"
+      "zip1 v23.4s, v29.4s, v27.4s\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip2 v27.4s, v29.4s, v27.4s\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q26, [x23], #0x10\n"
+      "zip1 v20.4s, v28.4s, v21.4s\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v24.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n" // rows 0-3, first 4-byte group
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "uadalp v1.8h, v16.16b\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "add x19, x19, #0x1\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n" // rows 4-7, first 4-byte group; the following stores alternate the same way for groups 2-4
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "uadalp v0.8h, v16.16b\n"
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "str q24, [%x[out_ptr], #0x20]\n"
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "uadalp v1.8h, v24.16b\n"
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "str q20, [%x[out_ptr], #0x30]\n"
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q21, [%x[out_ptr], #0x40]\n"
+      "str q18, [%x[out_ptr], #0x50]\n"
+      "uadalp v0.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x60]\n"
+      "uadalp v1.8h, v21.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x10\n" // 16 bytes consumed from every row
+      "uadalp v0.8h, v18.16b\n"
+      "cmp %x[width], #0x10\n"
+      "uadalp v1.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 rows x 16 bytes written
+      "uadalp v0.8h, v16.16b\n"
+      "bge 3b\n"
+      "5:" // Main loop skip
+      "cbz %x[width], 14f\n" // nothing left: go straight to storing the sums
+      "tbz %x[width], #3, 9f\n" // tail of 1..15 bytes: load 8/4/2/1-byte pieces according to the bits of width
+      "ldr d29, [x27], #0x8\n" // scalar LDR zeroes the unused upper lanes, so padding bytes in the last group are zero
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v29.h }[6], [x27], #0x2\n"
+      "ld1 { v28.h }[6], [x26], #0x2\n"
+      "ld1 { v27.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v26.h }[6], [x23], #0x2\n"
+      "ld1 { v25.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v22.h }[6], [x20], #0x2\n"
+      "mov x19, #0x4\n" // from here on x19 = number of 4-byte groups the tail must emit
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[14], [x27]\n"
+      "ld1 { v28.b }[14], [x26]\n"
+      "ld1 { v27.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v26.b }[14], [x23]\n"
+      "ld1 { v25.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v22.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:" // odd_loads_1_12
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[12], [x27]\n"
+      "ld1 { v28.b }[12], [x26]\n"
+      "ld1 { v27.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v26.b }[12], [x23]\n"
+      "ld1 { v25.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v22.b }[12], [x20]\n"
+      "mov x19, #0x4\n"
+      "b 13f\n"
+      "7:" // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v29.h }[4], [x27], #0x2\n"
+      "ld1 { v28.h }[4], [x26], #0x2\n"
+      "ld1 { v27.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v26.h }[4], [x23], #0x2\n"
+      "ld1 { v25.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v22.h }[4], [x20], #0x2\n"
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[10], [x27]\n"
+      "ld1 { v28.b }[10], [x26]\n"
+      "ld1 { v27.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v26.b }[10], [x23]\n"
+      "ld1 { v25.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v22.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:" // odd_loads_1_8
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[8], [x27]\n"
+      "ld1 { v28.b }[8], [x26]\n"
+      "ld1 { v27.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v26.b }[8], [x23]\n"
+      "ld1 { v25.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v22.b }[8], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 13f\n"
+      "9:" // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v26.h }[2], [x23], #0x2\n"
+      "ld1 { v25.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v22.h }[2], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[6], [x27]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v26.b }[6], [x23]\n"
+      "ld1 { v25.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v22.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:" // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[4], [x27]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v26.b }[4], [x23]\n"
+      "ld1 { v25.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v22.b }[4], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 13f\n"
+      "11:" // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h29, [x27], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h26, [x23], #0x2\n"
+      "ldr h25, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h22, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[2], [x27]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v26.b }[2], [x23]\n"
+      "ld1 { v25.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v22.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:" // odd_loads_1_0
+      "ldr b29, [x27, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b26, [x23, #0x0]\n"
+      "ldr b25, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:" // Odd load end
+      "zip1 v23.4s, v29.4s, v27.4s\n"
+      "subs x19, x19, #0x1\n" // one 32-byte group (8 rows x 4 bytes) is emitted per remaining count; the beq below uses these flags
+      "zip1 v20.4s, v28.4s, v21.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "uadalp v1.8h, v16.16b\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"
+      "zip2 v24.4s, v23.4s, v20.4s\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "uadalp v1.8h, v24.16b\n"
+      "str q20, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v20.16b\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"
+      "zip2 v27.4s, v29.4s, v27.4s\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "str q21, [%x[out_ptr], #0x0]\n"
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "uadalp v1.8h, v21.16b\n"
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v18.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "uadalp v1.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "14:" // Odds skip
+      "uadalp v31.4s, v1.8h\n" // final fold of the 16-bit partials into the 32-bit row sums
+      "uadalp v30.4s, v0.8h\n"
+      "str q31, [%x[out_ptr], #0x0]\n" // append the eight 32-bit row sums (rows 0-3, then rows 4-7) after the interleaved data
+      "str q30, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
new file mode 100644
index 0000000000..704a4c9210
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+// int8_t kernel without row sums: interleaves 8 input rows in 8-byte groups into out_ptr; the trailing unnamed bool is not referenced.
+template<>
+void interleave_block<8, 8, VLType::None, false>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: advanced past everything written; in: table of 8 row pointers; width: bytes consumed per row; height: rows at index >= height are read from row 0 instead
+  size_t row_offset, bool // row_offset: byte offset added to every row pointer
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are loaded unconditionally
+      "cmp %x[height], #0x8\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight row pointers
+      "mov x20, x27\n" // any other height: row 7 reads row 0 (assumes height <= 8 -- TODO confirm with callers)
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // each remaining row pointer is kept only if its index < height, else replaced by row 0
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x10\n" // flags consumed by the blt after the prefetches
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"
+      "2:" // Main loop head
+      "ldr q27, [x27], #0x10\n" // 16 bytes from each of the 8 rows per iteration
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n" // first 8-byte group: rows 0-1, then rows 2-3, 4-5, 6-7
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "str q24, [%x[out_ptr], #0x40]\n" // second 8-byte group, same row order
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x10\n" // 16 bytes consumed from every row
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 rows x 16 bytes written
+      "bge 2b\n"
+      "3:" // Main loop skip
+      "cbz %x[width], 12f\n" // nothing left
+      "tbz %x[width], #3, 7f\n" // tail of 1..15 bytes: load 8/4/2/1-byte pieces according to the bits of width
+      "ldr d27, [x27], #0x8\n" // scalar LDR zeroes the unused upper lanes, so padding bytes in the last group are zero
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #2, 5f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v27.h }[6], [x27], #0x2\n"
+      "ld1 { v24.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "mov x19, #0x2\n" // x19 = number of 8-byte groups the tail must emit (1 or 2)
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[14], [x27]\n"
+      "ld1 { v24.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 11f\n"
+      "4:" // odd_loads_1_12
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[12], [x27]\n"
+      "ld1 { v24.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 11f\n"
+      "5:" // odd_loads_2_8
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v27.h }[4], [x27], #0x2\n"
+      "ld1 { v24.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[10], [x27]\n"
+      "ld1 { v24.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 11f\n"
+      "6:" // odd_loads_1_8
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[8], [x27]\n"
+      "ld1 { v24.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 11f\n"
+      "7:" // odd_loads_4_0
+      "tbz %x[width], #2, 9f\n"
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v27.h }[2], [x27], #0x2\n"
+      "ld1 { v24.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[6], [x27]\n"
+      "ld1 { v24.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 11f\n"
+      "8:" // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[4], [x27]\n"
+      "ld1 { v24.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 11f\n"
+      "9:" // odd_loads_2_0
+      "tbz %x[width], #1, 10f\n"
+      "ldr h27, [x27], #0x2\n"
+      "ldr h24, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h18, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[2], [x27]\n"
+      "ld1 { v24.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 11f\n"
+      "10:" // odd_loads_1_0
+      "ldr b27, [x27, #0x0]\n"
+      "ldr b24, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b18, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "11:" // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n" // one 64-byte group (8 rows x 8 bytes) is emitted per remaining count; the beq below uses these flags
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 12f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "12:" // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+// uint8_t overload: forwards to the int8_t specialisation above on the same storage; that kernel only loads, zips and stores bytes, so signedness does not affect the result.
+template<>
+void interleave_block<8, 8, VLType::None, false>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  int8_t * &out_cast = reinterpret_cast<int8_t * &>(out_ptr); // reference cast: the callee's pointer advance is visible through the caller's out_ptr
+  const int8_t * const * in_cast = reinterpret_cast<const int8_t * const *>(in);
+
+  interleave_block<8, 8, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
new file mode 100644
index 0000000000..2317ece790
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifdef __aarch64__
+// int8_t kernel: interleaves 8 input rows in 8-byte groups into out_ptr and appends eight signed 32-bit per-row byte sums (added to the previous call's sums when 'first' is false).
+template<>
+void interleave_block<8, 8, VLType::None, true>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: advanced past everything written; in: table of 8 row pointers; width: bytes consumed per row; height: rows at index >= height are read from row 0 instead
+  size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the 32 bytes of sums just before out_ptr are reloaded and overwritten
+)
+{
+  __asm__ __volatile__(
+      "movi v5.8h, #0x0\n" // v5/v4/v3/v2: 16-bit partial sums for row pairs 0-1 / 2-3 / 4-5 / 6-7
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are loaded unconditionally
+      "mov x19, #0x0\n" // x19: main-loop iterations since the partial sums were last folded
+      "movi v4.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"
+      "movi v3.8h, #0x0\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "movi v2.8h, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "movi v1.4s, #0x0\n" // v1/v0/v31/v30: 32-bit sums matching v5/v4/v3/v2
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "movi v0.4s, #0x0\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "movi v31.4s, #0x0\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "movi v30.4s, #0x0\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight row pointers
+      "mov x20, x27\n" // any other height: row 7 reads row 0 (assumes height <= 8 -- TODO confirm with callers)
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n" // each remaining row pointer is kept only if its index < height, else replaced by row 0
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:" // no_pointer_adj
+      "movi v29.4s, #0x0\n" // v29/v28: sums carried over from the previous call (stay zero when first is true)
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "movi v28.4s, #0x0\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n" // first call: nothing to carry over
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n" // continuation: step back over the 32 bytes of sums stored by the previous call
+      "ld1 { v29.4s }, [%x[out_ptr]]\n" // reload them; the interleaved data written below overwrites that location
+      "ldr q28, [%x[out_ptr], #0x10]\n"
+      "2:" // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"
+      "3:" // Main loop head
+      "cmp x19, #0x3e\n" // once more than 62 iterations have accumulated, fold the 16-bit partials into the 32-bit sums (presumably to stay clear of 16-bit overflow)
+      "ble 4f\n"
+      "sadalp v1.4s, v5.8h\n"
+      "movi v5.8h, #0x0\n"
+      "sadalp v0.4s, v4.8h\n"
+      "movi v4.8h, #0x0\n"
+      "sadalp v31.4s, v3.8h\n"
+      "movi v3.8h, #0x0\n"
+      "sadalp v30.4s, v2.8h\n"
+      "movi v2.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:" // no_accumulate_16
+      "ldr q27, [x27], #0x10\n" // 16 bytes from each of the 8 rows per iteration
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n" // first 8-byte group: rows 0-1, then rows 2-3, 4-5, 6-7
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "sadalp v5.8h, v26.16b\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "str q24, [%x[out_ptr], #0x40]\n" // second 8-byte group, same row order
+      "sadalp v5.8h, v24.16b\n"
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "sadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "sadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "sadalp v2.8h, v16.16b\n"
+      "add x19, x19, #0x1\n"
+      "subs %x[width], %x[width], #0x10\n" // 16 bytes consumed from every row
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 rows x 16 bytes written
+      "bge 3b\n"
+      "5:" // Main loop skip
+      "cbz %x[width], 14f\n" // nothing left: go straight to storing the sums
+      "tbz %x[width], #3, 9f\n" // tail of 1..15 bytes: load 8/4/2/1-byte pieces according to the bits of width
+      "ldr d27, [x27], #0x8\n" // scalar LDR zeroes the unused upper lanes, so padding bytes in the last group are zero
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v27.h }[6], [x27], #0x2\n"
+      "ld1 { v24.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "mov x19, #0x2\n" // from here on x19 = number of 8-byte groups the tail must emit (1 or 2)
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[14], [x27]\n"
+      "ld1 { v24.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:" // odd_loads_1_12
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[12], [x27]\n"
+      "ld1 { v24.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 13f\n"
+      "7:" // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v27.h }[4], [x27], #0x2\n"
+      "ld1 { v24.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[10], [x27]\n"
+      "ld1 { v24.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:" // odd_loads_1_8
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[8], [x27]\n"
+      "ld1 { v24.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 13f\n"
+      "9:" // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v27.h }[2], [x27], #0x2\n"
+      "ld1 { v24.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[6], [x27]\n"
+      "ld1 { v24.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:" // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[4], [x27]\n"
+      "ld1 { v24.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 13f\n"
+      "11:" // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h27, [x27], #0x2\n"
+      "ldr h24, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h18, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[2], [x27]\n"
+      "ld1 { v24.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:" // odd_loads_1_0
+      "ldr b27, [x27, #0x0]\n"
+      "ldr b24, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b18, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:" // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n" // one 64-byte group (8 rows x 8 bytes) is emitted per remaining count; the beq below uses these flags
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "sadalp v5.8h, v26.16b\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 14f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "sadalp v5.8h, v24.16b\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "14:" // Odds skip
+      "sadalp v1.4s, v5.8h\n" // final fold of the 16-bit partials into the 32-bit sums
+      "sadalp v0.4s, v4.8h\n"
+      "addp v1.4s, v1.4s, v0.4s\n" // pairwise add leaves one 32-bit sum per row: rows 0-3
+      "sadalp v31.4s, v3.8h\n"
+      "sadalp v30.4s, v2.8h\n"
+      "add v1.4s, v1.4s, v29.4s\n" // plus the sums carried over from the previous call
+      "str q1, [%x[out_ptr], #0x0]\n" // append the eight 32-bit row sums after the interleaved data
+      "addp v0.4s, v31.4s, v30.4s\n" // rows 4-7
+      "add v0.4s, v0.4s, v28.4s\n"
+      "str q0, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
new file mode 100644
index 0000000000..07164d6b24
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 8, VLType::None, true>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v5.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v4.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v3.8h, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v2.8h, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "movi v1.4s, #0x0\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "movi v0.4s, #0x0\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "movi v31.4s, #0x0\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj: branched to directly when height is 8, otherwise every row pointer at or beyond height was replaced by the row 0 pointer (x27) just above + "movi v29.4s, #0x0\n" + "prfm pldl1keep, [x27, #0x0]\n" + "movi v28.4s, #0x0\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v29.4s }, 
[%x[out_ptr]]\n" + "ldr q28, [%x[out_ptr], #0x10]\n" + "2:" // first_pass: v29 and v28 are zero when first is set, otherwise they hold the eight 32-bit sums reloaded from the 0x20 bytes before the incoming out_ptr, which has been rewound over them + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head: x19 counts iterations since the 16-bit sums in v2-v5 were last widened into v1, v0, v31, v30 - widen and reset once it exceeds 0x3e + "cmp x19, #0x3e\n" + "ble 4f\n" + "uadalp v1.4s, v5.8h\n" + "movi v5.8h, #0x0\n" + "uadalp v0.4s, v4.8h\n" + "movi v4.8h, #0x0\n" + "uadalp v31.4s, v3.8h\n" + "movi v3.8h, #0x0\n" + "uadalp v30.4s, v2.8h\n" + "movi v2.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16: load 16 bytes from each of the 8 rows, store 0x80 bytes as 8-byte per-row chunks, and add the bytes pairwise into the 16-bit sums v2-v5 + "ldr q27, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "uadalp v5.8h, v26.16b\n" + "str q23, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v17.16b\n" + "str q24, [%x[out_ptr], #0x40]\n" + "uadalp v5.8h, v24.16b\n" + "str q21, [%x[out_ptr], #0x50]\n" + "uadalp v4.8h, v21.16b\n" + "str q18, [%x[out_ptr], #0x60]\n" + "uadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "uadalp v2.8h, v16.16b\n" + "add x19, x19, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 3b\n" + "5:" // Main loop skip: fewer than 16 columns remain - nothing more to load when width is zero, otherwise bits 3 to 0 of width select the partial loads below + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, 
[x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v24.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v27.h }[6], [x27], #0x2\n" + "ld1 { v24.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[14], [x27]\n" + "ld1 { v24.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[12], [x27]\n" + "ld1 { v24.b }[12], [x26]\n" + "ld1 { v25.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v27.h }[4], [x27], #0x2\n" + "ld1 { v24.h }[4], [x26], #0x2\n" + "ld1 { v25.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[10], [x27]\n" + "ld1 { v24.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x22]\n" 
+ "ld1 { v19.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[8], [x27]\n" + "ld1 { v24.b }[8], [x26]\n" + "ld1 { v25.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s27, [x27], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v27.h }[2], [x27], #0x2\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[6], [x27]\n" + "ld1 { v24.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[4], [x27]\n" + "ld1 { v24.b }[4], [x26]\n" + "ld1 { v25.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h27, [x27], #0x2\n" + "ldr h24, [x26], #0x2\n" + "ldr h25, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h18, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[2], [x27]\n" + "ld1 { v24.b 
}[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b27, [x27, #0x0]\n" + "ldr b24, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b18, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end: x19 is now the number of 8-byte per-row chunks to emit for the tail (1 or 2) + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "uadalp v5.8h, v26.16b\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 14f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "uadalp v5.8h, v24.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v21.16b\n" + "str q18, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip: widen the remaining 16-bit sums, add the sums carried in v29 and v28, store eight 32-bit row sums after the data and advance out_ptr by 0x20 + "uadalp v1.4s, v5.8h\n" + "uadalp v0.4s, v4.8h\n" + "addp v1.4s, v1.4s, v0.4s\n" + "uadalp v31.4s, v3.8h\n" + "uadalp v30.4s, v2.8h\n" + "add v1.4s, v1.4s, v29.4s\n" + "str q1, [%x[out_ptr], #0x0]\n" + "addp v0.4s, v31.4s, v30.4s\n" + "add v0.4s, v0.4s, v28.4s\n" + "str q0, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp new file mode 100644 index 0000000000..52b49c0f0c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "a32_interleave6_block1_fp32_fp32.hpp" +#include "a64_interleave4_block16_s8_s8.hpp" +#include "a64_interleave4_block16_s8_s8_summing.hpp" +#include "a64_interleave4_block16_u8_u8_summing.hpp" +#include "a64_interleave8_block1_bf16_fp32.hpp" +#include "a64_interleave8_block1_fp16_fp16.hpp" +#include "a64_interleave8_block1_fp16_fp32.hpp" +#include "a64_interleave8_block1_fp32_fp32.hpp" +#include "a64_interleave8_block1_s16_s16.hpp" +#include "a64_interleave8_block1_s16_s16_summing.hpp" +#include "a64_interleave8_block1_s8_s16.hpp" +#include "a64_interleave8_block1_s8_s16_summing.hpp" +#include "a64_interleave8_block1_u16_u16_summing.hpp" +#include "a64_interleave8_block1_u8_u16.hpp" +#include "a64_interleave8_block1_u8_u16_summing.hpp" +#include "a64_interleave8_block2_bf16_bf16.hpp" +#include "a64_interleave8_block2_fp32_fp32.hpp" +#include "a64_interleave8_block4_bf16_bf16.hpp" +#include "a64_interleave8_block4_s8_s8.hpp" +#include "a64_interleave8_block4_s8_s8_summing.hpp" +#include "a64_interleave8_block4_u8_u8_summing.hpp" +#include "a64_interleave8_block8_s8_s8.hpp" +#include "a64_interleave8_block8_s8_s8_summing.hpp" +#include "a64_interleave8_block8_u8_u8_summing.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp new file mode 100644 index 0000000000..2b3e170a3b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "asmlib.hpp" +#include "convolution_parameters.hpp" +#include "convolver.hpp" +#include "interleave_indirect.hpp" +#include "bfloat.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "utils.hpp" + +namespace arm_gemm { + +/* + * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together. + * + * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining + * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad + * with a particular value). + * + * Note that it is not expected for this templated version to ever be used - all cases that matter should be + * explicitly specialized with an optimized implementation. 
+ */ +template +void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) { + const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length() / block : 1); + + std::vector the_sums; + + if (integrate_sums) { + the_sums = std::vector(int_by, 0); + + if (!first) { + // In 'integrate sums' mode, we dump the sums at the end on each pass. + + // On the last pass this is correct, but on other passes it is not - + // so on the subsequent pass we need to take the output written by + // the previous pass as starting point for the sums, and then + // overwrite them with new interleaved data. + int32_t *out_int32 = reinterpret_cast(out); + + // Rewind pointer to where we wrote out the sums last time. + out_int32 -= int_by; + + // Restore the running sums. + memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t)); + + // Update the "real" pointer so that the next output will clobber the old sums. + out = reinterpret_cast(out_int32); + } + } + + for (unsigned int pos=0; pos= height) { + for (unsigned int col=0; col= width) { + *out++ = 0; + continue; + } + + if (integrate_sums) { + the_sums[row] += in[row][row_offset + pos + col]; + } + + *out++ = in[row][row_offset + pos + col]; + } + } + } + + if (integrate_sums) { + int32_t *out_int32 = reinterpret_cast(out); + + memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t)); + + out = reinterpret_cast(out_int32 + int_by); + } +} + +template +inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) { + const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length() / block : 1); + + // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not. + if (row_sum_multiplier) { + // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the + // next block (post sums). 
+ // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'. + int32_t *out_int32 = reinterpret_cast(out); + + out_int32 -= height; + for (unsigned int i=0; i() will *not* have done the sums, so 'out' will point to the start of the + // sum block. We need to insert the (zero) sums, and advance 'out'. + int32_t *out_int32 = reinterpret_cast(out); + + for (unsigned int i=0; i(out_int32); + } +} + +template +void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen, + unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, + const unsigned int k0, const unsigned int kmax, bool integrate_sums, + const int32_t row_sum_multiplier) { + const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length() / block : 1); + + // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input + // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for + // out of range rows). This allows interleave_block to use techniques like row predication, or loading all + // pointers and conditionally overriding the out of range ones. + + // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of + // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be + // expensive in highly threaded scenarios. + const TIn **row_ptrs = reinterpret_cast(alloca(height * sizeof(const TIn *))); + + // Figure out the starting position based on k0 (with rounded length) + unsigned int start_string = k0 / rounded_stringlen; + unsigned int start_stringpos = k0 % rounded_stringlen; + + // Process blocks of 'height' height... 
+ for (unsigned int ybase = y0; ybase < ymax; ybase+=height) { + // Height to process + unsigned int active_height = std::min(ymax - ybase, height); + + // Track our progress through the various strings + unsigned int k_left = (kmax - k0); + unsigned int string = start_string; + unsigned int stringpos = start_stringpos; + + bool first = true; + + // Prepare to call 'interleave_block' above for each string encompassed by K range + while (k_left > 0) { + // Width to process - and the width we will generate (with padding) + unsigned int in_width = std::min(k_left, stringlen - stringpos); + unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos); + + const TIn * const *row_base = ptr[string] + ybase; + + // If not all rows are valid, copy the ones that are into local array (see above comment). + if (active_height < height) { + for (unsigned int i=0; i::value && integrate_sums && row_sum_multiplier) { + interleave_block(out, row_base, in_width, active_height, stringpos, first); + } else { + interleave_block(out, row_base, in_width, active_height, stringpos, first); + } + + k_left -= out_width; + string++; + stringpos=0; + first=false; + } + + if (std::is_integral::value && integrate_sums) { + FixupRowSums(out, row_sum_multiplier); + } + } +} + +template +void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver &conv, const unsigned int rounded_stringlen, + const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) { + const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length() / block : 1); + + auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen); + + // Use alloca here as a std::vector can be expensive in highly threaded scenarios. 
+ const TIn **row_ptrs = reinterpret_cast(alloca(height * sizeof(const TIn *))); + + for (unsigned int ybase = y0; ybase < ymax; ybase += height) { + // How many of the rows are active - the rest will get padded in interleave_block. + unsigned int active_height = std::min(ymax - ybase, height); + bool first = true; + + auto conv_rows = conv_cols.process_rows(ybase, active_height); + + while (!conv_rows.finished()) { + unsigned int width, offset; + + // Get next set of parameters + std::tie(width, offset) = conv_rows.next_block(row_ptrs); + + // Perform the interleave + if (std::is_integral::value && integrate_sums && row_sum_multiplier) { + interleave_block(out, row_ptrs, width, active_height, offset, first); + } else { + interleave_block(out, row_ptrs, width, active_height, offset, first); + } + + first=false; + } + + if (std::is_integral::value && integrate_sums) { + FixupRowSums(out, row_sum_multiplier); + } + } +} + +template +void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) { + const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length() / block : 1); + + // Use alloca here as a std::vector can be expensive in highly threaded scenarios. 
+ const TIn **row_ptrs = reinterpret_cast(alloca(height * sizeof(const TIn *))); + + const unsigned int width=kmax-k0; + + for (unsigned int y=y0; y::value && integrate_sums && row_sum_multiplier) { + interleave_block(out, row_ptrs, width, std::min(height, ymax-y), k0, true); + } else { + interleave_block(out, row_ptrs, width, std::min(height, ymax-y), k0, true); + } + + if (std::is_integral::value && integrate_sums) { + FixupRowSums(out, row_sum_multiplier); + } + } +} + +#include "indirect-interleaves/list.hpp" + +/**** Instantiate needed implementations ****/ + +/* AArch32 */ +#ifdef __arm__ +/* FP32 */ +/* NEON implementation (height 6) */ +template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FP16 */ +#if __ARM_FP16_ARGS +/* NEON implementation using FP32 kernel (height 6) */ +template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +#endif /* __ARM_FP16_ARGS */ + +/* BF16 */ +/* NEON implementation using FP32 kernel */ +template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * 
const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +#endif + +/* AArch64 */ +#ifdef __aarch64__ +/* FP64 */ +/* NEON/SVE implementation (height 8) */ +template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FP32 */ +/* NEON/SVE implementation (height 8) */ +template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FMMLA */ +template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, 
size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FP16 */ +template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* BF16 */ +/* NEON/SVE BFDOT */ +template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void 
IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON/SVE using FP32 kernel */ +template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* INT16 */ +template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const 
uint16_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* INT8 */ +/* NEON SMLA/SMLAL (height 4, block 16) */ +template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 4) */ +template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* MMLA SMMLA (height 8, block 8) */ +template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 8, VLType::None>(int8_t *, const 
int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 1) */ +template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SMLA/SMLAL (height 4, block 16) */ +template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 4) */ +template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* MMLA SMMLA (height 8, block 8) */ +template void IndirectInterleave<8, 8, VLType::None>(uint8_t 
*, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON 16-bit (height 8, block 1) */ +template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +#endif // __aarch64__ + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp new file mode 100644 index 0000000000..660577f0e3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +/* Declares the three interleave entry points of arm_gemm; only declarations live in this header. NOTE(review): the two system #include targets and the template parameter lists below look truncated in this copy (angle-bracket contents missing) - confirm against the upstream file. */ +#include +#include + +#include "convolution_parameters.hpp" +#include "convolver.hpp" +#include "utils.hpp" + +namespace arm_gemm { +/* Variant whose input is addressed through a three-level pointer table (ptr) instead of a base pointer and stride. The trailing unnamed bool / int32_t parameters are not explained by the declaration - TODO confirm against the definition. */ +template +void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +/* Variant that takes a base pointer, a stride and a convolver object (type presumably declared in convolver.hpp). */ +template +void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t); +/* Variant that takes a plain base pointer and in_stride; presumably processes rows [y0, ymax) and depth [k0, kmax) - TODO confirm that the bounds are half-open. */ +template +void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t); + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp deleted file mode 100644 index 0f0e5a7ed4..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int); - -// 12x8 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class gemm_s16_12x8 { -public: - typedef int16_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() { - return 12; - } - - static unsigned int out_height() { - return 8; - } - - static unsigned int k_unroll() { - return 1; - } - - // Use the standard fixed size transforms. 
- StdTransformsFixed transforms = {}; - - kern_type kernel = a64_gemm_s16_asimd_12x8; - - gemm_s16_12x8(const CPUInfo *) { } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp deleted file mode 100644 index 7052f83a3d..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const int16_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - - for (int yb = 0; yb < ablocks; yb++) - { - const int16_t *a_ptr0 = a_ptr; - const int16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register int16x8_t aa asm("v0"); - register int16x8_t ab asm("v1"); - register int16x8_t b0 asm("v2"); - register int16x8_t b1 asm("v3"); - register int16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, 
#0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. - - "1:\n" // Main loop - // First unroll - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "smlal2 v8.4s, %[b1].8h, 
%[ab].h[3]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, 
%[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], 
#0xe0]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - 
"smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} - -} // namespace arm_gemm - -#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp new file mode 100644 index 0000000000..8bf8d8442e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementation; the arguments are (Apanel, Bpanel, Cpanel, ablocks, bblocks, K). +void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, int, int); + +// 8x12 signed 16-bit integer GEMM "strategy" class (int16_t operands, int32_t results). +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class cls_a64_gemm_s16_8x12 { +public: + typedef int16_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters: the output tile is out_height() x out_width() = 8 x 12 */ + static unsigned int out_width() { + return 12; + } + + static unsigned int out_height() { + return 8; + } + + static unsigned int k_unroll() { + return 1; + } + + // Use the standard fixed size transforms. NOTE(review): the StdTransformsFixed template arguments look truncated in this copy - confirm against the upstream file. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel = a64_gemm_s16_asimd_8x12; + + cls_a64_gemm_s16_8x12(const CPUInfo *) { } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp new file mode 100644 index 0000000000..a77938ffa7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2017 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <arm_neon.h> + +#include "../../asmlib.hpp" + +namespace arm_gemm { +/* Signed 16-bit GEMM micro-kernel producing 8x12 int32 output tiles. For every (yb, xb) pair the asm block zeroes accumulators v5-v28, walks K steps of the A panel (8 int16 values per step) and the B panel (12 int16 values per step) with SMLAL/SMLAL2, then stores 0x180 bytes (96 int32 results) at c_ptr and advances it. a_ptr is rewound to the start of the current A block for each xb, while b_ptr keeps advancing through Bpanel. The main loop handles two K steps per iteration with an even/odd tail; K is expected to be >= 1 (K == 0 would start the loop counter at -1). */ +void a64_gemm_s16_asimd_8x12(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const int16_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + for (int yb = 0; yb < ablocks; yb++) + { + const int16_t *a_ptr0 = a_ptr; + const int16_t *b_ptr = Bpanel; + + for (int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K+1)/2 - 1; + + register int16x8_t aa asm("v0"); + register int16x8_t ab asm("v1"); + register int16x8_t b0 asm("v2"); + register int16x8_t b1 asm("v3"); + register int16x8_t b2 asm("v4"); + + __asm __volatile ( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + "movi v27.4s, 
#0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. + + "1:\n" // Main loop + // First unroll + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "smlal2 v8.4s, %[b1].8h, 
%[ab].h[3]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, 
%[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, [%x[c_ptr], 
#0xe0]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + 
"smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), + [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) + : [odd_k] "r" (odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc", "memory" /* the asm reads the A/B panels and writes C through plain pointer operands, so memory must be listed as clobbered */ + ); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp deleted file mode 100644 index 0e294bfe8d..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "arm_gemm.hpp" - -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Load the actual kernel -void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); -void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int); -void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int); - -class gemm_s8_12x8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() { - return 12; - } - - static unsigned int out_height() { - return 8; - } - - static unsigned int k_unroll() { - return 4; - } - - // Use the standard fixed size transforms. - StdTransformsFixed transforms = {}; - - kern_type kernel = a64_gemm_s8_12x8; - - gemm_s8_12x8(const CPUInfo *ci) { - auto mod = ci->get_cpu_model(); - - if (mod == CPUModel::A55r1) { - kernel = a64_gemm_s8_12x8_a55r1; - } else if (mod == CPUModel::X1) { - kernel = a64_gemm_s8_12x8_x1; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp deleted file mode 100644 index ddd8124ec9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __aarch64__ - -#include - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. 
- const int oddk = (W & 1); - const int k_iters = ((W+1)/2) - 1; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb transforms = {}; + StdTransformsFixed transforms_quantized = {}; kern_type kernel=a64_gemm_s8_4x4; - gemm_s8_4x4(const CPUInfo *) { } + cls_a64_gemm_s8_4x4(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp new file mode 100644 index 0000000000..eee817e8e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "arm_gemm.hpp" + +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Load the actual kernel +void a64_gemm_s8_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_gemm_s8_8x12_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class cls_a64_gemm_s8_8x12 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() { + return 12; + } + + static unsigned int out_height() { + return 8; + } + + static unsigned int k_unroll() { + return 4; + } + + // Use the standard fixed size transforms. 
+ StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel = a64_gemm_s8_8x12; + + cls_a64_gemm_s8_8x12(const CPUInfo *ci) { + auto mod = ci->get_cpu_model(); + + if (mod == CPUModel::A55r1) { + kernel = a64_gemm_s8_8x12_a55r1; + } else if (mod == CPUModel::X1) { + kernel = a64_gemm_s8_8x12_x1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp new file mode 100644 index 0000000000..bb5226e093 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int k_iters = ((W+1)/2) - 1; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb transforms = {}; - - kern_type kernel = a64_gemm_u16_asimd_12x8; - - gemm_u16_12x8(const CPUInfo *) { } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp deleted file mode 100644 index 66f0b7c0ac..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const uint16_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - for (int yb = 0; yb < ablocks; yb++) - { - const uint16_t *a_ptr0 = a_ptr; - const uint16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register uint16x8_t aa asm("v0"); - register uint16x8_t ab asm("v1"); - register uint16x8_t b0 asm("v2"); - register uint16x8_t b1 asm("v3"); - register uint16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - 
"movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. - - "1:\n" // Main loop - // First unroll - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "umlal2 v8.4s, 
%[b1].8h, %[ab].h[3]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, 
%[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, 
[%x[c_ptr], #0xe0]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], 
#0x110]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc", "memory" - ); - } - } -} - -} // namespace arm_gemm - -#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp new file mode 100644 index 0000000000..e49ebbd84e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); + +class cls_a64_gemm_u16_8x12 { +public: + typedef uint16_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() { + return 12; + } + + static unsigned int out_height() { + return 8; + } + + static unsigned int k_unroll() { + return 1; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel = a64_gemm_u16_asimd_8x12; + + cls_a64_gemm_u16_8x12(const CPUInfo *) { } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp new file mode 100644 index 0000000000..98da7830f0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u16_asimd_8x12(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const uint16_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + for (int yb = 0; yb < ablocks; yb++) + { + const uint16_t *a_ptr0 = a_ptr; + const uint16_t *b_ptr = Bpanel; + + for (int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K+1)/2 - 1; + + register uint16x8_t aa asm("v0"); + register uint16x8_t ab asm("v1"); + register uint16x8_t b0 asm("v2"); + register uint16x8_t b1 asm("v3"); + register uint16x8_t b2 asm("v4"); + + __asm __volatile ( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + 
"movi v27.4s, #0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. + + "1:\n" // Main loop + // First unroll + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "umlal2 v8.4s, 
%[b1].8h, %[ab].h[3]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, 
%[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, 
[%x[c_ptr], #0xe0]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], 
#0x110]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), + [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) + : [odd_k] "r" (odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc", "memory" + ); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp deleted file mode 100644 index c0990ecd57..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Load the actual kernel -void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); -void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); -void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - -class gemm_u8_12x8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static unsigned int out_width() { - return 12; - } - - static unsigned int out_height() { - return 8; - } - - static unsigned int k_unroll() { - return 4; - } - - // Use the standard fixed sized transforms. 
- StdTransformsFixed transforms = {}; - - kern_type kernel = a64_gemm_u8_12x8; - - gemm_u8_12x8(const CPUInfo *ci) { - auto mod = ci->get_cpu_model(); - - if (mod == CPUModel::A55r1) { - kernel = a64_gemm_u8_12x8_a55r1; - } else if (mod == CPUModel::X1) { - kernel = a64_gemm_u8_12x8_x1; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp deleted file mode 100644 index c9a8a8229c..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int k_iters = ((W+1)/2) - 1; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. 
- const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb transforms = {}; + StdTransformsFixed transforms_quantized = {}; kern_type kernel = a64_gemm_u8_4x4; - gemm_u8_4x4(const CPUInfo *) { } + cls_a64_gemm_u8_4x4(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp new file mode 100644 index 0000000000..256ba2e08c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Load the actual kernel +void a64_gemm_u8_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void a64_gemm_u8_8x12_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class cls_a64_gemm_u8_8x12 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 4; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 4; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static unsigned int out_width() { + return 12; + } + + static unsigned int out_height() { + return 8; + } + + static unsigned int k_unroll() { + return 4; + } + + // Use the standard fixed sized transforms. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel = a64_gemm_u8_8x12; + + cls_a64_gemm_u8_8x12(const CPUInfo *ci) { + auto mod = ci->get_cpu_model(); + + if (mod == CPUModel::A55r1) { + kernel = a64_gemm_u8_8x12_a55r1; + } else if (mod == CPUModel::X1) { + kernel = a64_gemm_u8_8x12_x1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp new file mode 100644 index 0000000000..63869c9fd4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ const int oddk = (W & 1); + const int k_iters = ((W+1)/2) - 1; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_gemv_fp32_mla_32; + + cls_a64_gemv_fp32_mla_32(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp new file mode 100644 index 0000000000..a2af8d6d14 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp @@ -0,0 +1,1546 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_gemv_fp32_mla_32 ( + const float *A_ptr, const float *B_ptr, float *output_ptr, + size_t N, size_t K, + const float *bias, Activation act, bool +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + const float *B_ptr = {}; + size_t output_offset = {}; + unsigned int input_initial_col = {}; + } ka; + + unsigned long flags=0; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "add x22, %x[N], #0x3\n" + "mov x21, %x[bias]\n" + "lsr x22, x22, #0x2\n" + "1:" // Column loop + "cmp x22, #0x8\n" + "bge 85f\n" + "cmp x22, #0x6\n" + "bgt 73f\n" + "beq 61f\n" + "cmp x22, #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp x22, #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 2f\n" + "ldr q24, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "b 3f\n" + "2:" // Width 1: no bias + "movi v24.16b, #0x0\n" + "3:" // Width 1: setup done + "cmp x20, #0x4\n" + "blt 6f\n" + "cmp x20, #0x8\n" + "blt 5f\n" + "4:" // Width 1: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + 
"ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v4.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 4b\n" + "5:" // Width 1: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "6:" // Width 1: Multiply loop: Main loop skip + "cbz x20, 8f\n" + "7:" // Width 1: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 7b\n" + "8:" // Width 1: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 9f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "9:" // Width 1: No activation + "cmp %x[N], #0x4\n" + "blt 10f\n" + "str q24, [%x[output_ptr], 
#0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 12f\n" + "10:" // Width 1: Partial writeback + "tbz %x[N], #1, 11f\n" + "str d24, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 12f\n" + "st1 { v24.s }[2], [%x[output_ptr]]\n" + "b 12f\n" + "11:" // Width 1: Partial direct writeback: partial_1_0 + "str s24, [%x[output_ptr], #0x0]\n" + "12:" // Width 1: Writeback done + "b 97f\n" + "13:" // Width 2 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 14f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "add x21, x21, #0x20\n" + "b 15f\n" + "14:" // Width 2: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "15:" // Width 2: setup done + "cmp x20, #0x4\n" + "blt 18f\n" + "cmp x20, #0x8\n" + "blt 17f\n" + "16:" // Width 2: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v4.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q6, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v6.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 16b\n" + "17:" // Width 2: Multiply loop: 
Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v10.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v11.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v12.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v14.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q15, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q16, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v16.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "18:" // Width 2: Multiply loop: Main loop skip + "cbz x20, 20f\n" + "19:" // Width 2: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v17.4s, v0.s[0]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v18.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 19b\n" + "20:" // Width 2: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 21f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "21:" // Width 2: No activation + "str q24, 
[%x[output_ptr], #0x0]\n" + "cmp %x[N], #0x8\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "blt 22f\n" + "str q25, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 24f\n" + "22:" // Width 2: Partial writeback + "tbz %x[N], #1, 23f\n" + "str d25, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 24f\n" + "st1 { v25.s }[2], [%x[output_ptr]]\n" + "b 24f\n" + "23:" // Width 2: Partial direct writeback: partial_1_4 + "tbz %x[N], #0, 24f\n" + "str s25, [%x[output_ptr], #0x0]\n" + "24:" // Width 2: Writeback done + "b 97f\n" + "25:" // Width 3 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 26f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "add x21, x21, #0x30\n" + "b 27f\n" + "26:" // Width 3: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "27:" // Width 3: setup done + "cmp x20, #0x4\n" + "blt 30f\n" + "cmp x20, #0x8\n" + "blt 29f\n" + "28:" // Width 3: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v4.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q5, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v5.4s, v0.s[1]\n" + "ldr q6, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v8.4s, v0.s[2]\n" + "ldr q9, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v9.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + 
"fmla v24.4s, v10.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v11.4s, v0.s[3]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v12.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 28b\n" + "29:" // Width 3: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v14.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v15.4s, v0.s[0]\n" + "ldr q16, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q17, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v17.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v18.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v19.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v20.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v21.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q22, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v22.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q23, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v23.4s, v0.s[3]\n" + "ldr q1, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v1.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "30:" // Width 3: Multiply loop: Main loop skip + "cbz x20, 
32f\n" + "31:" // Width 3: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[0]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v4.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 31b\n" + "32:" // Width 3: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 33f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "33:" // Width 3: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "cmp %x[N], #0xc\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "blt 34f\n" + "str q26, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 36f\n" + "34:" // Width 3: Partial writeback + "tbz %x[N], #1, 35f\n" + "str d26, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 36f\n" + "st1 { v26.s }[2], [%x[output_ptr]]\n" + "b 36f\n" + "35:" // Width 3: Partial direct writeback: partial_1_8 + "tbz %x[N], #0, 36f\n" + "str s26, [%x[output_ptr], #0x0]\n" + "36:" // Width 3: Writeback done + "b 97f\n" + "37:" // Width 4 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 38f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "b 39f\n" + "38:" // Width 4: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "39:" // Width 4: setup done + "cmp x20, #0x4\n" + "blt 42f\n" + "cmp x20, #0x8\n" + "blt 41f\n" + "40:" // Width 4: Multiply loop: Main loop head + "ldr 
q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[1]\n" + "ldr q6, [%x[B_ptr], #0x10]\n" + "ldr q7, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v7.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "ldr q11, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v10.4s, v0.s[2]\n" + "ldr q12, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v11.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v12.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[3]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v14.4s, v0.s[3]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v15.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [x19, #0x80]\n" + "sub x20, x20, #0x4\n" + "cmp x20, #0x8\n" + "bge 40b\n" + "41:" // Width 4: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v17.4s, v0.s[0]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "ldr q19, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v18.4s, v0.s[0]\n" + "ldr q20, 
[%x[B_ptr], #0x30]\n" + "fmla v26.4s, v19.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v20.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q21, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v21.4s, v0.s[1]\n" + "ldr q22, [%x[B_ptr], #0x10]\n" + "ldr q23, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v22.4s, v0.s[1]\n" + "ldr q1, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v23.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v1.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[2]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v4.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v5.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[3]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v7.4s, v0.s[3]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v8.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v9.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [x19, #0x80]\n" + "42:" // Width 4: Multiply loop: Main loop skip + "cbz x20, 44f\n" + "43:" // Width 4: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[0]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v11.4s, v0.s[0]\n" + "ldr q13, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v12.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "fmla v27.4s, v13.4s, v0.s[0]\n" + "cbnz x20, 43b\n" + "44:" // Width 4: Multiply loop: No odd 
multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 45f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "45:" // Width 4: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "cmp %x[N], #0x10\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "blt 46f\n" + "str q27, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 48f\n" + "46:" // Width 4: Partial writeback + "tbz %x[N], #1, 47f\n" + "str d27, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 48f\n" + "st1 { v27.s }[2], [%x[output_ptr]]\n" + "b 48f\n" + "47:" // Width 4: Partial direct writeback: partial_1_12 + "tbz %x[N], #0, 48f\n" + "str s27, [%x[output_ptr], #0x0]\n" + "48:" // Width 4: Writeback done + "b 97f\n" + "49:" // Width 5 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 50f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "add x21, x21, #0x50\n" + "b 51f\n" + "50:" // Width 5: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "51:" // Width 5: setup done + "cmp x20, #0x4\n" + "blt 54f\n" + "cmp x20, #0x8\n" + "blt 53f\n" + "52:" // Width 5: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, 
v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v7.4s, v0.s[1]\n" + "ldr q10, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v8.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v9.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v10.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v11.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v12.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v13.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v14.4s, v0.s[2]\n" + "ldr q16, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v15.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q17, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v16.4s, v0.s[3]\n" + "ldr q18, [%x[B_ptr], #0x20]\n" + "ldr q19, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v17.4s, v0.s[3]\n" + "ldr q20, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v18.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v19.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v20.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 52b\n" + "53:" // Width 5: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q21, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v21.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x10]\n" + "ldr q23, [%x[B_ptr], #0x20]\n" + "fmla 
v25.4s, v22.4s, v0.s[0]\n" + "ldr q1, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v23.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v1.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v2.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v3.4s, v0.s[1]\n" + "ldr q5, [%x[B_ptr], #0x20]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v4.4s, v0.s[1]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v5.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v7.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v8.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v9.4s, v0.s[2]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v10.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v11.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v12.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v13.4s, v0.s[3]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v14.4s, v0.s[3]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v15.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v17.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "54:" // Width 5: Multiply loop: Main loop skip + "cbz x20, 56f\n" + "55:" // Width 5: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q18, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v18.4s, v0.s[0]\n" + "ldr q19, [%x[B_ptr], 
#0x10]\n" + "ldr q20, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v19.4s, v0.s[0]\n" + "ldr q21, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v20.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v21.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "fmla v28.4s, v22.4s, v0.s[0]\n" + "cbnz x20, 55b\n" + "56:" // Width 5: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 57f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "57:" // Width 5: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "cmp %x[N], #0x14\n" + "add %x[output_ptr], %x[output_ptr], #0x40\n" + "blt 58f\n" + "str q28, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 60f\n" + "58:" // Width 5: Partial writeback + "tbz %x[N], #1, 59f\n" + "str d28, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 60f\n" + "st1 { v28.s }[2], [%x[output_ptr]]\n" + "b 60f\n" + "59:" // Width 5: Partial direct writeback: partial_1_16 + "tbz %x[N], #0, 60f\n" + "str s28, [%x[output_ptr], #0x0]\n" + "60:" // Width 5: Writeback done + "b 97f\n" + "61:" // Width 6 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 62f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "add x21, x21, #0x60\n" + "b 63f\n" + "62:" // Width 6: no bias + "movi v24.16b, 
#0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "63:" // Width 6: setup done + "cmp x20, #0x4\n" + "blt 66f\n" + "cmp x20, #0x8\n" + "blt 65f\n" + "64:" // Width 6: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v7.4s, v0.s[1]\n" + "ldr q9, [%x[B_ptr], #0x20]\n" + "ldr q10, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v8.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v10.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v11.4s, v0.s[1]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v12.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v13.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v14.4s, v0.s[2]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "ldr q18, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v15.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[2]\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v17.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v18.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + 
"ldr q22, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v19.4s, v0.s[3]\n" + "ldr q23, [%x[B_ptr], #0x40]\n" + "ldr q1, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v20.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v21.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v27.4s, v22.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "fmla v28.4s, v23.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "fmla v29.4s, v1.4s, v0.s[3]\n" + "bge 64b\n" + "65:" // Width 6: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[0]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v4.4s, v0.s[0]\n" + "ldr q6, [%x[B_ptr], #0x40]\n" + "ldr q7, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v5.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v6.4s, v0.s[0]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v8.4s, v0.s[1]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v10.4s, v0.s[1]\n" + "ldr q13, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v11.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v12.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q15, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "ldr q16, [%x[B_ptr], #0x20]\n" + "ldr q17, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v15.4s, v0.s[2]\n" + "ldr q18, [%x[B_ptr], #0x40]\n" + "ldr q19, [%x[B_ptr], #0x50]\n" + "fmla 
v26.4s, v16.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v17.4s, v0.s[2]\n" + "ldr q20, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v18.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q21, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v19.4s, v0.s[2]\n" + "ldr q22, [%x[B_ptr], #0x20]\n" + "ldr q23, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v20.4s, v0.s[3]\n" + "ldr q1, [%x[B_ptr], #0x40]\n" + "ldr q2, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v21.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v22.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v27.4s, v23.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v28.4s, v1.4s, v0.s[3]\n" + "fmla v29.4s, v2.4s, v0.s[3]\n" + "66:" // Width 6: Multiply loop: Main loop skip + "cbz x20, 68f\n" + "67:" // Width 6: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "ldr q5, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v4.4s, v0.s[0]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v5.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "ldr q8, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v6.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "fmla v28.4s, v7.4s, v0.s[0]\n" + "fmla v29.4s, v8.4s, v0.s[0]\n" + "cbnz x20, 67b\n" + "68:" // Width 6: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, 
v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "69:" // Width 6: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "cmp %x[N], #0x18\n" + "add %x[output_ptr], %x[output_ptr], #0x50\n" + "blt 70f\n" + "str q29, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 72f\n" + "70:" // Width 6: Partial writeback + "tbz %x[N], #1, 71f\n" + "str d29, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 72f\n" + "st1 { v29.s }[2], [%x[output_ptr]]\n" + "b 72f\n" + "71:" // Width 6: Partial direct writeback: partial_1_20 + "tbz %x[N], #0, 72f\n" + "str s29, [%x[output_ptr], #0x0]\n" + "72:" // Width 6: Writeback done + "b 97f\n" + "73:" // Width 7 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 74f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "ldr q30, [x21, #0x60]\n" + "add x21, x21, #0x70\n" + "b 75f\n" + "74:" // Width 7: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "75:" // Width 7: setup done + "cmp x20, #0x4\n" + "blt 78f\n" + "cmp x20, #0x8\n" + "blt 77f\n" + "76:" // Width 7: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], 
%x[B_ptr], #0x80\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v7.4s, v0.s[0]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v8.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v9.4s, v0.s[1]\n" + "ldr q13, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v10.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v11.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v12.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q15, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v13.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x10]\n" + "ldr q17, [%x[B_ptr], #0x20]\n" + "fmla v30.4s, v14.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v15.4s, v0.s[2]\n" + "ldr q19, [%x[B_ptr], #0x40]\n" + "ldr q20, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v16.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v17.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v18.4s, v0.s[2]\n" + "ldr q22, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v19.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q23, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v20.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x20]\n" + "ldr q2, [%x[B_ptr], #0x30]\n" + "fmla v30.4s, v21.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v22.4s, v0.s[3]\n" + "ldr q4, [%x[B_ptr], #0x50]\n" + "ldr q5, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v23.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v1.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v2.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v3.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + 
"sub x20, x20, #0x4\n" + "fmla v29.4s, v4.4s, v0.s[3]\n" + "cmp x20, #0x8\n" + "fmla v30.4s, v5.4s, v0.s[3]\n" + "bge 76b\n" + "77:" // Width 7: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v7.4s, v0.s[0]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v8.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x40]\n" + "ldr q11, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v9.4s, v0.s[0]\n" + "ldr q12, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v28.4s, v10.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v11.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v14.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v15.4s, v0.s[1]\n" + "ldr q19, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v16.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v17.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v18.4s, v0.s[1]\n" + "ldr q21, [%x[B_ptr], #0x10]\n" + "ldr q22, [%x[B_ptr], #0x20]\n" + "fmla v30.4s, v19.4s, v0.s[1]\n" + "ldr q23, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v20.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x40]\n" + "ldr q2, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v21.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v22.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v23.4s, v0.s[2]\n" + "ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v1.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + 
"ldr q5, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v2.4s, v0.s[2]\n" + "ldr q6, [%x[B_ptr], #0x20]\n" + "ldr q7, [%x[B_ptr], #0x30]\n" + "fmla v30.4s, v3.4s, v0.s[2]\n" + "ldr q8, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v4.4s, v0.s[3]\n" + "ldr q9, [%x[B_ptr], #0x50]\n" + "ldr q10, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v5.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v6.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v29.4s, v9.4s, v0.s[3]\n" + "fmla v30.4s, v10.4s, v0.s[3]\n" + "78:" // Width 7: Multiply loop: Main loop skip + "cbz x20, 80f\n" + "79:" // Width 7: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v11.4s, v0.s[0]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v13.4s, v0.s[0]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "ldr q16, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v14.4s, v0.s[0]\n" + "ldr q17, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v28.4s, v15.4s, v0.s[0]\n" + "fmla v29.4s, v16.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "fmla v30.4s, v17.4s, v0.s[0]\n" + "cbnz x20, 79b\n" + "80:" // Width 7: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 81f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, 
v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "81:" // Width 7: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "str q29, [%x[output_ptr], #0x50]\n" + "cmp %x[N], #0x1c\n" + "add %x[output_ptr], %x[output_ptr], #0x60\n" + "blt 82f\n" + "str q30, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 84f\n" + "82:" // Width 7: Partial writeback + "tbz %x[N], #1, 83f\n" + "str d30, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 84f\n" + "st1 { v30.s }[2], [%x[output_ptr]]\n" + "b 84f\n" + "83:" // Width 7: Partial direct writeback: partial_1_24 + "tbz %x[N], #0, 84f\n" + "str s30, [%x[output_ptr], #0x0]\n" + "84:" // Width 7: Writeback done + "b 97f\n" + "85:" // Width 8 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 86f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "ldr q30, [x21, #0x60]\n" + "ldr q31, [x21, #0x70]\n" + "add x21, x21, #0x80\n" + "b 87f\n" + "86:" // Width 8: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "87:" // Width 8: setup done + "cmp x20, #0x4\n" + "blt 90f\n" + "cmp x20, #0x8\n" + "blt 89f\n" + "88:" // Width 8: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla 
v27.4s, v4.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x60]\n" + "ldr q8, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v30.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v31.4s, v8.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "ldr q11, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x30]\n" + "ldr q13, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v10.4s, v0.s[1]\n" + "fmla v26.4s, v11.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x50]\n" + "ldr q15, [%x[B_ptr], #0x60]\n" + "fmla v27.4s, v12.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v13.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v14.4s, v0.s[1]\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v15.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "fmla v31.4s, v16.4s, v0.s[1]\n" + "ldr q19, [%x[B_ptr], #0x20]\n" + "ldr q20, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v17.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x40]\n" + "ldr q22, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v18.4s, v0.s[2]\n" + "ldr q23, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v19.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x70]\n" + "fmla v27.4s, v20.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v21.4s, v0.s[2]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v22.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "fmla v30.4s, v23.4s, v0.s[2]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v31.4s, v1.4s, v0.s[2]\n" + "ldr q6, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v2.4s, v0.s[3]\n" + "ldr q7, [%x[B_ptr], #0x50]\n" + "ldr q8, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v3.4s, v0.s[3]\n" + "ldr 
q9, [%x[B_ptr], #0x70]\n" + "fmla v26.4s, v4.4s, v0.s[3]\n" + "fmla v27.4s, v5.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v6.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v29.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "fmla v30.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "fmla v31.4s, v9.4s, v0.s[3]\n" + "bge 88b\n" + "89:" // Width 8: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[0]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v11.4s, v0.s[0]\n" + "ldr q13, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x40]\n" + "ldr q15, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v13.4s, v0.s[0]\n" + "ldr q16, [%x[B_ptr], #0x60]\n" + "ldr q17, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v14.4s, v0.s[0]\n" + "fmla v29.4s, v15.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v30.4s, v16.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q18, [%x[B_ptr], #0x0]\n" + "fmla v31.4s, v17.4s, v0.s[0]\n" + "ldr q19, [%x[B_ptr], #0x10]\n" + "ldr q20, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v18.4s, v0.s[1]\n" + "ldr q21, [%x[B_ptr], #0x30]\n" + "ldr q22, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v19.4s, v0.s[1]\n" + "fmla v26.4s, v20.4s, v0.s[1]\n" + "ldr q23, [%x[B_ptr], #0x50]\n" + "ldr q1, [%x[B_ptr], #0x60]\n" + "fmla v27.4s, v21.4s, v0.s[1]\n" + "ldr q2, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v22.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v23.4s, v0.s[1]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v1.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v31.4s, v2.4s, v0.s[1]\n" + "ldr q5, 
[%x[B_ptr], #0x20]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v3.4s, v0.s[2]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "ldr q8, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v4.4s, v0.s[2]\n" + "ldr q9, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v5.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x70]\n" + "fmla v27.4s, v6.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v7.4s, v0.s[2]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v8.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v30.4s, v9.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v31.4s, v10.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q16, [%x[B_ptr], #0x50]\n" + "ldr q17, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v12.4s, v0.s[3]\n" + "ldr q18, [%x[B_ptr], #0x70]\n" + "fmla v26.4s, v13.4s, v0.s[3]\n" + "fmla v27.4s, v14.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v15.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v29.4s, v16.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla v30.4s, v17.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v31.4s, v18.4s, v0.s[3]\n" + "90:" // Width 8: Multiply loop: Main loop skip + "cbz x20, 92f\n" + "91:" // Width 8: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v19.4s, v0.s[0]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v20.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v21.4s, v0.s[0]\n" + "ldr q23, [%x[B_ptr], #0x40]\n" + "ldr q1, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v22.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x60]\n" + "ldr q3, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v23.4s, v0.s[0]\n" + "fmla v29.4s, v1.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, 
#0x1\n" + "fmla v30.4s, v2.4s, v0.s[0]\n" + "fmla v31.4s, v3.4s, v0.s[0]\n" + "cbnz x20, 91b\n" + "92:" // Width 8: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 93f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "93:" // Width 8: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "str q29, [%x[output_ptr], #0x50]\n" + "str q30, [%x[output_ptr], #0x60]\n" + "cmp %x[N], #0x20\n" + "add %x[output_ptr], %x[output_ptr], #0x70\n" + "blt 94f\n" + "str q31, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 96f\n" + "94:" // Width 8: Partial writeback + "tbz %x[N], #1, 95f\n" + "str d31, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 96f\n" + "st1 { v31.s }[2], [%x[output_ptr]]\n" + "b 96f\n" + "95:" // Width 8: Partial direct writeback: partial_1_28 + "tbz %x[N], #0, 96f\n" + "str s31, [%x[output_ptr], #0x0]\n" + "96:" // Width 8: Writeback done + "subs x22, x22, #0x8\n" + "sub %x[N], %x[N], #0x20\n" + "bgt 1b\n" + "97:" // Exit + + : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) + : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), 
[offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp deleted file mode 100644 index 79cae6002a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) - -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); - -// 24x8 HGEMM "strategy" class. Describes the kernel properties. -// -// The generic "gemm_opt" function will instantiate one of these (allowing -// the constructor to pick a kernel implementation). -class hgemm_24x8 { -public: - typedef __fp16 operand_type; - typedef __fp16 result_type; - - typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() { - return 24; - } - - static unsigned int out_height() { - return 8; - } - - static unsigned int k_unroll() { - return 1; - } - - // Use the standard fixed size transforms. - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel = a64_hgemm_asimd_24x8; - - hgemm_24x8(const CPUInfo *ci) { - auto model = ci->get_cpu_model(); - - if (model == CPUModel::A55r1) { - kernel = a64_hgemm_asimd_24x8_a55r1; - } else if (model == CPUModel::X1) { - kernel = a64_hgemm_asimd_24x8_x1; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp deleted file mode 100644 index 829ae30001..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported. -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) - -#include - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
- -namespace arm_gemm { - -void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k_iters = ((K+1)/2) - 1; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 24x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -namespace arm_gemm { - -void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 24x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
- -namespace arm_gemm { - -void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - for (int yb=0; yb transforms = {}; + + // Default to the generic kernel + kern_type kernel = a64_hgemm_asimd_8x24; + + cls_a64_hgemm_8x24(const CPUInfo *ci) { + auto model = ci->get_cpu_model(); + + if (model == CPUModel::A55r1) { + kernel = a64_hgemm_asimd_8x24_a55r1; + } else if (model == CPUModel::X1) { + kernel = a64_hgemm_asimd_8x24_x1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp new file mode 100644 index 0000000000..29cdd33893 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported. +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x24), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k_iters = ((K+1)/2) - 1; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x24), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order.
+ +namespace arm_gemm { + +void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x24), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + for (int yb=0; yb, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_bf16fp32_dot_6x16( ARGLIST ); + +class cls_a64_hybrid_bf16fp32_dot_6x16 +{ +public: + typedef bfloat16 operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 2; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_bf16fp32_dot_6x16; + + cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp new file mode 100644 index 0000000000..be680ed645 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp @@ -0,0 +1,3668 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_bf16fp32_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 186f\n" + "cmp %x[M], #0x4\n" + "bgt 149f\n" + "beq 112f\n" + "cmp %x[M], #0x2\n" + "bgt 75f\n" + "beq 38f\n" + "ldr x16, [%x[args_ptr], 
%[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x14, 4f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "b 15f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x16, #0x10\n" + "bge 13f\n" + "tbz x16, #3, 8f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 6f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 5f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 12f\n" + "5:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 12f\n" + "ldr s11, [x13, #0x0]\n" + "b 12f\n" + "6:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 7f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 12f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 12f\n" + "7:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 12f\n" + "ldr s10, [x13, #0x0]\n" + "b 12f\n" + "8:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 10f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 9f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 12f\n" + "9:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 12f\n" + "ldr s9, [x13, #0x0]\n" + "b 12f\n" + "10:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 11f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 12f\n" + "11:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr 
s8, [x13, #0x0]\n" + "12:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 15f\n" + "13:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x12, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x11, #0x8\n" + "blt 21f\n" + "cmp x11, #0x10\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x10\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, 
v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 
0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 26f\n" + "cmp x11, #0x2\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x2\n" + "add x15, x15, #0x40\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "cmp x11, #0x2\n" + "bge 22b\n" + "cbz x11, 26f\n" + "23:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 24f\n" + "ldr s0, [x10], #0x4\n" + "tbz x11, #0, 25f\n" + "ld1 { v0.h }[2], [x10]\n" + "b 25f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "25:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 16b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, 
v11.4s, v1.4s\n" + "27:" // Height 1: No activation + "cmp x16, #0x10\n" + "bge 36f\n" + "tbz x16, #3, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 29f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 28f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v11.s }[2], [x13]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 35f\n" + "str s11, [x13, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 30f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v10.s }[2], [x13]\n" + "b 35f\n" + "30:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 35f\n" + "str s10, [x13, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 33f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 32f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v9.s }[2], [x13]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 35f\n" + "str s9, [x13, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 34f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v8.s }[2], [x13]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "37:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" + "bgt 3b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 39f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr 
x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 40f\n" + "39:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "40:" // Height 2: Column loop + "cbz x14, 41f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 52f\n" + "41:" // Height 2: no bias + "tbz %x[flags], #0, 51f\n" + "cmp x16, #0x10\n" + "bge 50f\n" + "tbz x16, #3, 45f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 43f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 42f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #0, 49f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 49f\n" + "42:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 49f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 49f\n" + "43:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 44f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 49f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 49f\n" + "44:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 49f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 49f\n" + "45:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 47f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 46f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #0, 49f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 49f\n" + "46:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz 
x16, #0, 49f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 49f\n" + "47:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 48f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 49f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 49f\n" + "48:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "49:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 52f\n" + "50:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 52f\n" + "51:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "52:" // Height 2: setup done + "mov x12, #0x0\n" + "53:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 55f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 55f\n" + "54:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "55:" // Height 2: input setup done + "cmp x11, #0x8\n" + "blt 58f\n" + "cmp x11, #0x10\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr 
q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, 
v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "bge 56b\n" + "57:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca 
// bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "58:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 63f\n" + "cmp x11, #0x2\n" + "blt 60f\n" + "59:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "bge 59b\n" + "cbz x11, 63f\n" + "60:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 61f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "tbz x11, #0, 62f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "b 62f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "62:" // Height 2: Multiply 
loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "63:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 53b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 64f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "64:" // Height 2: No activation + "cmp x16, #0x10\n" + "bge 73f\n" + "tbz x16, #3, 68f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 66f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 65f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], 
[x9]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 72f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 67f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 72f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 70f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 69f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 72f\n" + "69:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 72f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 71f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "74:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 40b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + 
"tbz %x[flags], #2, 76f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 77f\n" + "76:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "77:" // Height 3: Column loop + "cbz x14, 78f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 89f\n" + "78:" // Height 3: no bias + "tbz %x[flags], #0, 88f\n" + "cmp x16, #0x10\n" + "bge 87f\n" + "tbz x16, #3, 82f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 80f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 79f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 86f\n" + "79:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 86f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 86f\n" + "80:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 81f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 86f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 86f\n" + "81:" // Height 3: 
Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 86f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 86f\n" + "82:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 84f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 83f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 86f\n" + "83:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 86f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 86f\n" + "84:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 85f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 86f\n" + "85:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "86:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 89f\n" + "87:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 89f\n" + "88:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + 
"movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "89:" // Height 3: setup done + "mov x12, #0x0\n" + "90:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 91f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 92f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 92f\n" + "91:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "92:" // Height 3: input setup done + "cmp x11, #0x8\n" + "blt 95f\n" + "cmp x11, #0x10\n" + "blt 94f\n" + "93:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot 
v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + 
"ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "bge 93b\n" + "94:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + 
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "95:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 100f\n" + "cmp x11, #0x2\n" + "blt 97f\n" + "96:" // Height 3: Multiply loop: Odd block loop + "ldr s0, 
[x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "cmp x11, #0x2\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "bge 96b\n" + "cbz x11, 100f\n" + "97:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 98f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "tbz x11, #0, 99f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "b 99f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "99:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + 
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "100:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 90b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 101f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "101:" // Height 3: No activation + "cmp x16, #0x10\n" + "bge 110f\n" + "tbz x16, #3, 105f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 103f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 102f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #0, 
109f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 109f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 104f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 109f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 107f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 106f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 109f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 108f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, 
[x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "111:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 77b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 113f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 114f\n" + "113:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "114:" // Height 4: Column loop + "cbz x14, 115f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 126f\n" + "115:" // Height 4: no bias + "tbz %x[flags], #0, 125f\n" + "cmp x16, #0x10\n" + "bge 124f\n" + "tbz x16, #3, 119f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 117f\n" + 
"ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 116f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 123f\n" + "116:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 123f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 123f\n" + "117:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 118f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 123f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 123f\n" + "118:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 123f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 123f\n" + "119:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 121f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 120f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 123f\n" + "120:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 123f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 123f\n" + "121:" // 
Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 122f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 123f\n" + "122:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "123:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 126f\n" + "124:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 126f\n" + "125:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "126:" // Height 4: setup done + "mov x12, #0x0\n" + "127:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 128f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, 
#0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 129f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 129f\n" + "128:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "129:" // Height 4: input setup done + "cmp x11, #0x8\n" + "blt 132f\n" + "cmp x11, #0x10\n" + "blt 131f\n" + "130:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot 
v23.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + 
"ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "bge 130b\n" + "131:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // 
bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, 
v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "132:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 137f\n" + "cmp x11, #0x2\n" + "blt 134f\n" + "133:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "ldr q6, 
[x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "bge 133b\n" + "cbz x11, 137f\n" + "134:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 135f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "tbz x11, #0, 136f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "b 136f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "136:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, 
v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "137:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 127b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 138f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "138:" // Height 4: No activation + "cmp x16, #0x10\n" + "bge 147f\n" + "tbz x16, #3, 142f\n" + "st1 { v8.4s }, [x13], 
#0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 140f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 139f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 146f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 141f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 146f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 144f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 143f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 146f\n" + "143:" // Height 4: Partial 
direct writeback: partial_1_4 + "tbz x16, #0, 146f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 145f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 146f\n" + "145:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "148:" // Height 4: Writeback done + "subs x16, x16, #0x10\n" + "bgt 114b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 150f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 
151f\n" + "150:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "151:" // Height 5: Column loop + "cbz x14, 152f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 163f\n" + "152:" // Height 5: no bias + "tbz %x[flags], #0, 162f\n" + "cmp x16, #0x10\n" + "bge 161f\n" + "tbz x16, #3, 156f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 154f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 153f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 160f\n" + "153:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 160f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, 
[x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 160f\n" + "154:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 155f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 160f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 160f\n" + "155:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 160f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 160f\n" + "156:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 158f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 157f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 160f\n" + "157:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 160f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 160f\n" + "158:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 159f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], 
[x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 160f\n" + "159:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "160:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 163f\n" + "161:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 163f\n" + "162:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "163:" // Height 5: setup done + "mov x12, #0x0\n" + "164:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + 
"ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 166f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 166f\n" + "165:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "166:" // Height 5: input setup done + "cmp x11, #0x8\n" + "blt 169f\n" + "cmp x11, #0x10\n" + "blt 168f\n" + "167:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot 
v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot 
v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, 
v4.h[3]\n" + "bge 167b\n" + "168:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot 
v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // 
bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + "169:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 174f\n" + "cmp x11, #0x2\n" + "blt 171f\n" + "170:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, 
v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "bge 170b\n" + "cbz x11, 174f\n" + "171:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 172f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x11, #0, 173f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x22]\n" + "b 173f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "173:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "ldr q6, 
[x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "174:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 164b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 175f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax 
v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "175:" // Height 5: No activation + "cmp x16, #0x10\n" + "bge 184f\n" + "tbz x16, #3, 179f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 177f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 176f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 183f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + 
"tbz x16, #1, 178f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 183f\n" + "178:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 183f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 181f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 180f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 183f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 182f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + 
"183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "185:" // Height 5: Writeback done + "subs x16, x16, #0x10\n" + "bgt 151b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 187f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 188f\n" + "187:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "188:" // Height 6: Column loop + "cbz x14, 189f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, 
v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 200f\n" + "189:" // Height 6: no bias + "tbz %x[flags], #0, 199f\n" + "cmp x16, #0x10\n" + "bge 198f\n" + "tbz x16, #3, 193f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 191f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 190f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 197f\n" + "190:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 197f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, 
#0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 197f\n" + "191:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 192f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 197f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 197f\n" + "192:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 197f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 197f\n" + "193:" // Height 6: Partial accumulate: partial_4_0 + "tbz x16, #2, 195f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 194f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 197f\n" + "194:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 197f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 197f\n" + "195:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 196f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, 
[x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 197f\n" + "196:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "197:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 200f\n" + "198:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 200f\n" + "199:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, 
#0x0\n" + "movi v31.16b, #0x0\n" + "200:" // Height 6: setup done + "mov x12, #0x0\n" + "201:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 202f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 203f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 203f\n" + "202:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "203:" // Height 6: input setup done + "cmp x11, #0x8\n" + "blt 206f\n" + "cmp x11, #0x10\n" + "blt 205f\n" + "204:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, 
#0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 
0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + 
".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" + "bge 204b\n" + "205:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, 
[x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + 
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" 
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" + "206:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 211f\n" + "cmp x11, #0x2\n" + "blt 208f\n" 
+ "207:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "bge 207b\n" + "cbz x11, 211f\n" + "208:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 209f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "tbz x11, #0, 210f\n" + 
"ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x22]\n" + "ld1 { v5.h }[2], [x20]\n" + "b 210f\n" + "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "ldr h5, [x20, #0x0]\n" + "210:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "211:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, 
[%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 201b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 212f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax 
v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "212:" // Height 6: No activation + "cmp x16, #0x10\n" + "bge 221f\n" + "tbz x16, #3, 216f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 214f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 213f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 220f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 215f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 
220f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 218f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 217f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, #0, 220f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 220f\n" + "218:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 219f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str 
q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "222:" // Height 6: Writeback done + "subs x16, x16, #0x10\n" + "bgt 188b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 224f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 223f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp new file mode 100644 index 0000000000..46de98504e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<__fp16>, \ + size_t, size_t, \ + const __fp16 *, \ + IndirectOutputArg<__fp16>, \ + const __fp16 *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_fp16_mla_6x32( ARGLIST ); + +class cls_a64_hybrid_fp16_mla_6x32 +{ +public: + typedef __fp16 operand_type; + typedef __fp16 result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 32; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp16_mla_6x32; + + cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp new file mode 100644 index 0000000000..ff6cbec200 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp @@ -0,0 +1,5400 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_fp16_mla_6x32 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif + "1:" // Row loop + 
"cmp %x[M], #0x6\n" + "bge 251f\n" + "cmp %x[M], #0x4\n" + "bgt 201f\n" + "beq 151f\n" + "cmp %x[M], #0x2\n" + "bgt 101f\n" + "beq 51f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x14, 4f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "b 23f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 22f\n" + "cmp x16, #0x20\n" + "bge 21f\n" + "tbz x16, #4, 12f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "tbz x16, #3, 8f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "tbz x16, #2, 6f\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #1, 5f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[6], [x13]\n" + "b 20f\n" + "5:" // Height 1: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[4], [x13]\n" + "b 20f\n" + "6:" // Height 1: Partial accumulate: partial_2_24 + "tbz x16, #1, 7f\n" + "ldr s11, [x13], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[2], [x13]\n" + "b 20f\n" + "7:" // Height 1: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 20f\n" + "ldr h11, [x13, #0x0]\n" + "b 20f\n" + "8:" // Height 1: Partial accumulate: partial_4_16 + "tbz x16, #2, 10f\n" + "ldr d10, [x13], #0x8\n" + "tbz x16, #1, 9f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[6], [x13]\n" + "b 20f\n" + "9:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[4], [x13]\n" + "b 20f\n" + "10:" // Height 1: Partial 
accumulate: partial_2_16 + "tbz x16, #1, 11f\n" + "ldr s10, [x13], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[2], [x13]\n" + "b 20f\n" + "11:" // Height 1: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 20f\n" + "ldr h10, [x13, #0x0]\n" + "b 20f\n" + "12:" // Height 1: Partial accumulate: partial_8_0 + "tbz x16, #3, 16f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "tbz x16, #2, 14f\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #1, 13f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[6], [x13]\n" + "b 20f\n" + "13:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[4], [x13]\n" + "b 20f\n" + "14:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 15f\n" + "ldr s9, [x13], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[2], [x13]\n" + "b 20f\n" + "15:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 20f\n" + "ldr h9, [x13, #0x0]\n" + "b 20f\n" + "16:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 18f\n" + "ldr d8, [x13], #0x8\n" + "tbz x16, #1, 17f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[6], [x13]\n" + "b 20f\n" + "17:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[4], [x13]\n" + "b 20f\n" + "18:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 19f\n" + "ldr s8, [x13], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[2], [x13]\n" + "b 20f\n" + "19:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "20:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 23f\n" + "21:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 23f\n" + "22:" // Height 1: no 
accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "23:" // Height 1: setup done + "mov x12, #0x0\n" + "24:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 25f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 26f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 26f\n" + "25:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "26:" // Height 1: input setup done + "cmp x11, #0x8\n" + "blt 29f\n" + "cmp x11, #0x10\n" + "blt 28f\n" + "27:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x8\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x10\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla 
v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "bge 27b\n" + "28:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, 
v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "29:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 31f\n" + "30:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x1\n" + "add x15, x15, #0x40\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "cbnz x11, 30b\n" + "31:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 24b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 32f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, 
v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "32:" // Height 1: No activation + "cmp x16, #0x20\n" + "bge 49f\n" + "tbz x16, #4, 40f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "tbz x16, #3, 36f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "tbz x16, #2, 34f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #1, 33f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v11.h }[6], [x13]\n" + "b 48f\n" + "33:" // Height 1: Partial direct writeback: partial_1_28 + "tbz x16, #0, 48f\n" + "st1 { v11.h }[4], [x13]\n" + "b 48f\n" + "34:" // Height 1: Partial direct writeback: partial_2_24 + "tbz x16, #1, 35f\n" + "str s11, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v11.h }[2], [x13]\n" + "b 48f\n" + "35:" // Height 1: Partial direct writeback: partial_1_24 + "tbz x16, #0, 48f\n" + "str h11, [x13, #0x0]\n" + "b 48f\n" + "36:" // Height 1: Partial direct writeback: partial_4_16 + "tbz x16, #2, 38f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #1, 37f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v10.h }[6], [x13]\n" + "b 48f\n" + "37:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x16, #0, 48f\n" + "st1 { v10.h }[4], [x13]\n" + "b 48f\n" + "38:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x16, #1, 39f\n" + "str s10, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v10.h }[2], [x13]\n" + "b 48f\n" + "39:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x16, #0, 48f\n" + "str h10, [x13, #0x0]\n" + "b 48f\n" + "40:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x16, #3, 44f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "tbz x16, #2, 42f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #1, 41f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v9.h }[6], [x13]\n" + "b 48f\n" + "41:" // Height 1: Partial 
direct writeback: partial_1_12 + "tbz x16, #0, 48f\n" + "st1 { v9.h }[4], [x13]\n" + "b 48f\n" + "42:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 43f\n" + "str s9, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v9.h }[2], [x13]\n" + "b 48f\n" + "43:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 48f\n" + "str h9, [x13, #0x0]\n" + "b 48f\n" + "44:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 46f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #1, 45f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v8.h }[6], [x13]\n" + "b 48f\n" + "45:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 48f\n" + "st1 { v8.h }[4], [x13]\n" + "b 48f\n" + "46:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 47f\n" + "str s8, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v8.h }[2], [x13]\n" + "b 48f\n" + "47:" // Height 1: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "48:" // Height 1: Partial direct writeback: Done + "b 50f\n" + "49:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "50:" // Height 1: Writeback done + "subs x16, x16, #0x20\n" + "bgt 3b\n" + "b 302f\n" + "51:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 52f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #1\n" + "b 53f\n" + "52:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "53:" // Height 2: Column loop + "cbz x14, 54f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, 
#0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 73f\n" + "54:" // Height 2: no bias + "tbz %x[flags], #0, 72f\n" + "cmp x16, #0x20\n" + "bge 71f\n" + "tbz x16, #4, 62f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "tbz x16, #3, 58f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "tbz x16, #2, 56f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #1, 55f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "b 70f\n" + "55:" // Height 2: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 70f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "b 70f\n" + "56:" // Height 2: Partial accumulate: partial_2_24 + "tbz x16, #1, 57f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 70f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "b 70f\n" + "57:" // Height 2: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 70f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "b 70f\n" + "58:" // Height 2: Partial accumulate: partial_4_16 + "tbz x16, #2, 60f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "tbz x16, #1, 59f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "b 70f\n" + "59:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 70f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "b 70f\n" + "60:" // Height 2: Partial accumulate: partial_2_16 + "tbz x16, #1, 61f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 70f\n" + "ld1 { 
v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "b 70f\n" + "61:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 70f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "b 70f\n" + "62:" // Height 2: Partial accumulate: partial_8_0 + "tbz x16, #3, 66f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "tbz x16, #2, 64f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #1, 63f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "b 70f\n" + "63:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "b 70f\n" + "64:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 65f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "b 70f\n" + "65:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 70f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "b 70f\n" + "66:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 68f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "tbz x16, #1, 67f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "b 70f\n" + "67:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "b 70f\n" + "68:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 69f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "b 70f\n" + "69:" // Height 2: Partial 
accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "70:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 73f\n" + "71:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 73f\n" + "72:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "73:" // Height 2: setup done + "mov x12, #0x0\n" + "74:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 75f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 76f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 76f\n" + "75:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "76:" // Height 2: input setup done + "cmp x11, #0x8\n" + "blt 79f\n" + "cmp x11, #0x10\n" + "blt 78f\n" + "77:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + "fmla v14.8h, 
v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" 
+ "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "bge 77b\n" + "78:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, 
v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, 
#0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "79:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 81f\n" + "80:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x1\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "cbnz x11, 80b\n" + "81:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 74b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 82f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "82:" // Height 2: No activation + "cmp x16, #0x20\n" + "bge 99f\n" + "tbz x16, #4, 90f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "tbz x16, #3, 86f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, 
[x9], #0x10\n" + "tbz x16, #2, 84f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #1, 83f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "b 98f\n" + "83:" // Height 2: Partial direct writeback: partial_1_28 + "tbz x16, #0, 98f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "b 98f\n" + "84:" // Height 2: Partial direct writeback: partial_2_24 + "tbz x16, #1, 85f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "b 98f\n" + "85:" // Height 2: Partial direct writeback: partial_1_24 + "tbz x16, #0, 98f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "b 98f\n" + "86:" // Height 2: Partial direct writeback: partial_4_16 + "tbz x16, #2, 88f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x16, #1, 87f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "b 98f\n" + "87:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x16, #0, 98f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "b 98f\n" + "88:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x16, #1, 89f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "b 98f\n" + "89:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x16, #0, 98f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "b 98f\n" + "90:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x16, #3, 94f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "tbz x16, #2, 92f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #1, 91f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "tbz x16, 
#0, 98f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "b 98f\n" + "91:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 98f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "b 98f\n" + "92:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 93f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "b 98f\n" + "93:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 98f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "b 98f\n" + "94:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 96f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #1, 95f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "b 98f\n" + "95:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 98f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "b 98f\n" + "96:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 97f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "b 98f\n" + "97:" // Height 2: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "98:" // Height 2: Partial direct writeback: Done + "b 100f\n" + "99:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "100:" // Height 2: Writeback done + "subs x16, x16, #0x20\n" + "bgt 53b\n" + "b 302f\n" + "101:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 102f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "add x27, x27, x19, LSL #1\n" + "b 103f\n" + "102:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "103:" // Height 3: Column loop + "cbz x14, 104f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 123f\n" + "104:" // Height 3: no bias + "tbz %x[flags], #0, 122f\n" + "cmp x16, #0x20\n" + "bge 121f\n" + "tbz x16, #4, 112f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "tbz x16, #3, 108f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "tbz x16, #2, 106f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #1, 105f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "b 120f\n" + "105:" // Height 3: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "b 120f\n" + "106:" // Height 3: Partial accumulate: partial_2_24 + "tbz x16, #1, 107f\n" + 
"ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "b 120f\n" + "107:" // Height 3: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 120f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "b 120f\n" + "108:" // Height 3: Partial accumulate: partial_4_16 + "tbz x16, #2, 110f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "tbz x16, #1, 109f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "b 120f\n" + "109:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "b 120f\n" + "110:" // Height 3: Partial accumulate: partial_2_16 + "tbz x16, #1, 111f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "b 120f\n" + "111:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 120f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "b 120f\n" + "112:" // Height 3: Partial accumulate: partial_8_0 + "tbz x16, #3, 116f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "tbz x16, #2, 114f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x16, #1, 113f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "tbz 
x16, #0, 120f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "b 120f\n" + "113:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 120f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "b 120f\n" + "114:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 115f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 120f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "b 120f\n" + "115:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 120f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "b 120f\n" + "116:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 118f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "tbz x16, #1, 117f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "b 120f\n" + "117:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "b 120f\n" + "118:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 119f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "b 120f\n" + "119:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "120:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub 
x27, x27, x19\n" + "b 123f\n" + "121:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 123f\n" + "122:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "123:" // Height 3: setup done + "mov x12, #0x0\n" + "124:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 125f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 126f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 126f\n" + "125:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "126:" // Height 3: input setup done + "cmp x11, #0x8\n" + "blt 129f\n" + "cmp x11, #0x10\n" + "blt 128f\n" + "127:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, 
v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "cmp x11, #0x10\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, 
v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla 
v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "bge 127b\n" + "128:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla 
v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" 
+ "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "129:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 131f\n" + "130:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "cbnz x11, 130b\n" + "131:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 124b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 132f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, 
v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "132:" // Height 3: No activation + "cmp x16, #0x20\n" + "bge 149f\n" + "tbz x16, #4, 140f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "tbz x16, #3, 136f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "tbz x16, #2, 134f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #1, 133f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "b 148f\n" + "133:" // Height 3: Partial direct writeback: partial_1_28 + "tbz x16, #0, 148f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "b 148f\n" + "134:" // Height 3: Partial direct writeback: partial_2_24 + "tbz x16, #1, 135f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "b 148f\n" + "135:" // Height 3: Partial direct writeback: partial_1_24 + 
"tbz x16, #0, 148f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "b 148f\n" + "136:" // Height 3: Partial direct writeback: partial_4_16 + "tbz x16, #2, 138f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #1, 137f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "b 148f\n" + "137:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x16, #0, 148f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "b 148f\n" + "138:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x16, #1, 139f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "b 148f\n" + "139:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x16, #0, 148f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "b 148f\n" + "140:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x16, #3, 144f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "tbz x16, #2, 142f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #1, 141f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "b 148f\n" + "141:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 148f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "b 148f\n" + "142:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 
143f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "b 148f\n" + "143:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 148f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "b 148f\n" + "144:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 146f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #1, 145f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "b 148f\n" + "145:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 148f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "b 148f\n" + "146:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 147f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "b 148f\n" + "147:" // Height 3: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "148:" // Height 3: Partial direct writeback: Done + "b 150f\n" + "149:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "150:" // Height 3: Writeback done + "subs x16, x16, #0x20\n" + "bgt 
103b\n" + "b 302f\n" + "151:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 152f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "b 153f\n" + "152:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "153:" // Height 4: Column loop + "cbz x14, 154f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 173f\n" + "154:" // Height 4: no bias + "tbz %x[flags], #0, 172f\n" + "cmp x16, #0x20\n" + "bge 171f\n" + "tbz x16, #4, 162f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "tbz x16, #3, 158f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "tbz x16, #2, 156f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #1, 155f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + 
"ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "b 170f\n" + "155:" // Height 4: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "b 170f\n" + "156:" // Height 4: Partial accumulate: partial_2_24 + "tbz x16, #1, 157f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "b 170f\n" + "157:" // Height 4: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 170f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "b 170f\n" + "158:" // Height 4: Partial accumulate: partial_4_16 + "tbz x16, #2, 160f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "tbz x16, #1, 159f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "b 170f\n" + "159:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], [x25]\n" + "b 170f\n" + "160:" // Height 4: Partial accumulate: partial_2_16 + "tbz x16, #1, 161f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + 
"ldr s22, [x25], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "b 170f\n" + "161:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 170f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "b 170f\n" + "162:" // Height 4: Partial accumulate: partial_8_0 + "tbz x16, #3, 166f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "tbz x16, #2, 164f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #1, 163f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "b 170f\n" + "163:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "b 170f\n" + "164:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 165f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "b 170f\n" + "165:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 170f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "b 170f\n" + "166:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 168f\n" + 
"ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "tbz x16, #1, 167f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "b 170f\n" + "167:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "b 170f\n" + "168:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 169f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "b 170f\n" + "169:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "170:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 173f\n" + "171:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 173f\n" + "172:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi 
v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "173:" // Height 4: setup done + "mov x12, #0x0\n" + "174:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 175f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 176f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 176f\n" + "175:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "176:" // Height 4: input setup done + "cmp x11, #0x8\n" + "blt 179f\n" + "cmp x11, #0x10\n" + "blt 178f\n" + "177:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla 
v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "cmp x11, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + 
"ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla 
v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "bge 177b\n" + "178:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" 
+ "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla 
v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, 
[x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "179:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 181f\n" + "180:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "cbnz x11, 180b\n" + "181:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 174b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 182f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + 
"add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "182:" // Height 4: No activation + "cmp x16, #0x20\n" + "bge 199f\n" + "tbz x16, #4, 190f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "tbz x16, #3, 186f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "tbz x16, #2, 184f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #1, 183f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v11.h }[6], 
[x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "b 198f\n" + "183:" // Height 4: Partial direct writeback: partial_1_28 + "tbz x16, #0, 198f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { v23.h }[4], [x25]\n" + "b 198f\n" + "184:" // Height 4: Partial direct writeback: partial_2_24 + "tbz x16, #1, 185f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "b 198f\n" + "185:" // Height 4: Partial direct writeback: partial_1_24 + "tbz x16, #0, 198f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "b 198f\n" + "186:" // Height 4: Partial direct writeback: partial_4_16 + "tbz x16, #2, 188f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #1, 187f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "b 198f\n" + "187:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x16, #0, 198f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "b 198f\n" + "188:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x16, #1, 189f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "b 198f\n" + "189:" // Height 4: Partial direct 
writeback: partial_1_16 + "tbz x16, #0, 198f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "b 198f\n" + "190:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x16, #3, 194f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "tbz x16, #2, 192f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #1, 191f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "b 198f\n" + "191:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 198f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "b 198f\n" + "192:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 193f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], [x25]\n" + "b 198f\n" + "193:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 198f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "b 198f\n" + "194:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 196f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #1, 195f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v8.h 
}[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "b 198f\n" + "195:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 198f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "b 198f\n" + "196:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 197f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "b 198f\n" + "197:" // Height 4: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "str h20, [x25, #0x0]\n" + "198:" // Height 4: Partial direct writeback: Done + "b 200f\n" + "199:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "200:" // Height 4: Writeback done + "subs x16, x16, #0x20\n" + "bgt 153b\n" + "b 302f\n" + "201:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 202f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, 
[%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 203f\n" + "202:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "203:" // Height 5: Column loop + "cbz x14, 204f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 223f\n" + "204:" // Height 5: no bias + "tbz %x[flags], #0, 222f\n" + "cmp x16, #0x20\n" + "bge 221f\n" + "tbz x16, #4, 212f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "tbz x16, #3, 208f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "tbz x16, #2, 206f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #1, 205f\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "ld1 { 
v27.s }[2], [x23], #0x4\n" + "mov x19, #0x3c\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x23]\n" + "b 220f\n" + "205:" // Height 5: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x23]\n" + "b 220f\n" + "206:" // Height 5: Partial accumulate: partial_2_24 + "tbz x16, #1, 207f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "ldr s27, [x23], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x23]\n" + "b 220f\n" + "207:" // Height 5: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 220f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "b 220f\n" + "208:" // Height 5: Partial accumulate: partial_4_16 + "tbz x16, #2, 210f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "tbz x16, #1, 209f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x23]\n" + "b 220f\n" + "209:" // Height 5: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], 
[x25]\n" + "ld1 { v26.h }[4], [x23]\n" + "b 220f\n" + "210:" // Height 5: Partial accumulate: partial_2_16 + "tbz x16, #1, 211f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s26, [x23], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x23]\n" + "b 220f\n" + "211:" // Height 5: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 220f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "b 220f\n" + "212:" // Height 5: Partial accumulate: partial_8_0 + "tbz x16, #3, 216f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "tbz x16, #2, 214f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #1, 213f\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "mov x19, #0x1c\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "ld1 { v25.h }[6], [x23]\n" + "b 220f\n" + "213:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "ld1 { v25.h }[4], [x23]\n" + "b 220f\n" + "214:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 215f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s25, [x23], 
#0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x23]\n" + "b 220f\n" + "215:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 220f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "b 220f\n" + "216:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 218f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "tbz x16, #1, 217f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 220f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "ld1 { v24.h }[6], [x23]\n" + "b 220f\n" + "217:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 220f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "ld1 { v24.h }[4], [x23]\n" + "b 220f\n" + "218:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 219f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "ldr s24, [x23], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 220f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x23]\n" + "b 220f\n" + "219:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "220:" // Height 5: Partial 
accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 223f\n" + "221:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 223f\n" + "222:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "223:" // Height 5: setup done + "mov x12, #0x0\n" + "224:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 225f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 226f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, 
LSL #1\n" + "b 226f\n" + "225:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "226:" // Height 5: input setup done + "cmp x11, #0x8\n" + "blt 229f\n" + "cmp x11, #0x10\n" + "blt 228f\n" + "227:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" 
+ "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, 
v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla 
v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "bge 227b\n" + "228:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" 
+ "add x22, x22, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + 
"fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, 
v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla 
v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "229:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 231f\n" + "230:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "cbnz x11, 230b\n" + "231:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 224b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 232f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, 
v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "232:" // Height 5: No activation + "cmp x16, #0x20\n" + "bge 249f\n" + "tbz x16, #4, 240f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "tbz x16, #3, 236f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "tbz x16, #2, 234f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #1, 233f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + 
"st1 { v27.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "st1 { v27.h }[6], [x23]\n" + "b 248f\n" + "233:" // Height 5: Partial direct writeback: partial_1_28 + "tbz x16, #0, 248f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { v23.h }[4], [x25]\n" + "st1 { v27.h }[4], [x23]\n" + "b 248f\n" + "234:" // Height 5: Partial direct writeback: partial_2_24 + "tbz x16, #1, 235f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "str s27, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "st1 { v27.h }[2], [x23]\n" + "b 248f\n" + "235:" // Height 5: Partial direct writeback: partial_1_24 + "tbz x16, #0, 248f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "str h27, [x23, #0x0]\n" + "b 248f\n" + "236:" // Height 5: Partial direct writeback: partial_4_16 + "tbz x16, #2, 238f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x16, #1, 237f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "st1 { v26.h }[6], [x23]\n" + "b 248f\n" + "237:" // Height 5: Partial direct writeback: partial_1_20 + "tbz x16, #0, 248f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "st1 { v26.h }[4], [x23]\n" + "b 248f\n" + "238:" // Height 5: Partial direct 
writeback: partial_2_16 + "tbz x16, #1, 239f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "str s26, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "st1 { v26.h }[2], [x23]\n" + "b 248f\n" + "239:" // Height 5: Partial direct writeback: partial_1_16 + "tbz x16, #0, 248f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "str h26, [x23, #0x0]\n" + "b 248f\n" + "240:" // Height 5: Partial direct writeback: partial_8_0 + "tbz x16, #3, 244f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "tbz x16, #2, 242f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #1, 241f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "st1 { v25.h }[6], [x23]\n" + "b 248f\n" + "241:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 248f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "st1 { v25.h }[4], [x23]\n" + "b 248f\n" + "242:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x16, #1, 243f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "str s25, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], 
[x25]\n" + "st1 { v25.h }[2], [x23]\n" + "b 248f\n" + "243:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 248f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "str h25, [x23, #0x0]\n" + "b 248f\n" + "244:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 246f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #1, 245f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "st1 { v24.h }[6], [x23]\n" + "b 248f\n" + "245:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 248f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "st1 { v24.h }[4], [x23]\n" + "b 248f\n" + "246:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 247f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "st1 { v24.h }[2], [x23]\n" + "b 248f\n" + "247:" // Height 5: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "str h20, [x25, #0x0]\n" + "str h24, [x23, #0x0]\n" + "248:" // Height 5: Partial direct writeback: Done + "b 250f\n" + "249:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str 
q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "250:" // Height 5: Writeback done + "subs x16, x16, #0x20\n" + "bgt 203b\n" + "b 302f\n" + "251:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 252f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 253f\n" + "252:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "add x21, x23, x19, LSL #1\n" + "add %x[output_ptr], x21, x19, LSL #1\n" + "253:" // Height 6: Column loop + "cbz x14, 254f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, 
v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 273f\n" + "254:" // Height 6: no bias + "tbz %x[flags], #0, 272f\n" + "cmp x16, #0x20\n" + "bge 271f\n" + "tbz x16, #4, 262f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "ld1 { v29.8h }, [x21], #0x10\n" + "tbz x16, #3, 258f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "ld1 { v30.8h }, [x21], #0x10\n" + "tbz x16, #2, 256f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #1, 255f\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v31.s }[2], [x21], #0x4\n" + "mov x19, #0x3c\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x23]\n" + "ld1 { v31.h }[6], [x21]\n" + "b 270f\n" + "255:" // Height 6: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x23]\n" + 
"ld1 { v31.h }[4], [x21]\n" + "b 270f\n" + "256:" // Height 6: Partial accumulate: partial_2_24 + "tbz x16, #1, 257f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s31, [x21], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v31.h }[2], [x21]\n" + "b 270f\n" + "257:" // Height 6: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 270f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h31, [x21, #0x0]\n" + "b 270f\n" + "258:" // Height 6: Partial accumulate: partial_4_16 + "tbz x16, #2, 260f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x16, #1, 259f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v30.s }[2], [x21], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x23]\n" + "ld1 { v30.h }[6], [x21]\n" + "b 270f\n" + "259:" // Height 6: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x23]\n" + "ld1 { v30.h }[4], [x21]\n" + "b 270f\n" + "260:" // Height 6: Partial accumulate: partial_2_16 + "tbz x16, #1, 261f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s26, 
[x23], #0x4\n" + "ldr s30, [x21], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x23]\n" + "ld1 { v30.h }[2], [x21]\n" + "b 270f\n" + "261:" // Height 6: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 270f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "ldr h30, [x21, #0x0]\n" + "b 270f\n" + "262:" // Height 6: Partial accumulate: partial_8_0 + "tbz x16, #3, 266f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "tbz x16, #2, 264f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #1, 263f\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v29.s }[2], [x21], #0x4\n" + "mov x19, #0x1c\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v29.h }[6], [x21]\n" + "b 270f\n" + "263:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v29.h }[4], [x21]\n" + "b 270f\n" + "264:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 265f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s25, [x23], #0x4\n" 
+ "ldr s29, [x21], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v29.h }[2], [x21]\n" + "b 270f\n" + "265:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 270f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h29, [x21, #0x0]\n" + "b 270f\n" + "266:" // Height 6: Partial accumulate: partial_4_0 + "tbz x16, #2, 268f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "tbz x16, #1, 267f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v28.s }[2], [x21], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v28.h }[6], [x21]\n" + "b 270f\n" + "267:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v28.h }[4], [x21]\n" + "b 270f\n" + "268:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 269f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s28, [x21], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v28.h }[2], [x21]\n" + "b 
270f\n" + "269:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h28, [x21, #0x0]\n" + "270:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 273f\n" + "271:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 273f\n" + "272:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "273:" // Height 6: setup done + "mov x12, #0x0\n" + "274:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr 
w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 275f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 276f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 276f\n" + "275:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "276:" // Height 6: input setup done + "cmp x11, #0x8\n" + "blt 279f\n" + "cmp x11, #0x10\n" + "blt 278f\n" + "277:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + "fmla v25.8h, v7.8h, 
v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, 
v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x140]\n" + 
"fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla 
v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "bge 277b\n" + "278:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla 
v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" 
+ "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, 
v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" 
+ "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "279:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 281f\n" + "280:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, 
v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "cbnz x11, 280b\n" + "281:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 274b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 282f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + 
"fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmin v28.8h, v28.8h, v0.8h\n" + "fmin v29.8h, v29.8h, v0.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "fmax v28.8h, v28.8h, v1.8h\n" + "fmax v29.8h, v29.8h, v1.8h\n" + "fmin v30.8h, v30.8h, v0.8h\n" + "fmin v31.8h, v31.8h, v0.8h\n" + "fmax v30.8h, v30.8h, v1.8h\n" + "fmax v31.8h, v31.8h, v1.8h\n" + "282:" // Height 6: No activation + "cmp x16, #0x20\n" + "bge 299f\n" + "tbz x16, #4, 290f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "st1 { v29.8h }, [x21], #0x10\n" + "tbz x16, #3, 286f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "st1 { v30.8h }, [x21], #0x10\n" + "tbz x16, #2, 284f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #1, 283f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x23], #0x4\n" + "st1 { v31.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "st1 { v27.h }[6], [x23]\n" + "st1 { v31.h }[6], [x21]\n" + "b 298f\n" + "283:" // Height 6: Partial direct writeback: partial_1_28 + "tbz x16, #0, 298f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { 
v23.h }[4], [x25]\n" + "st1 { v27.h }[4], [x23]\n" + "st1 { v31.h }[4], [x21]\n" + "b 298f\n" + "284:" // Height 6: Partial direct writeback: partial_2_24 + "tbz x16, #1, 285f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "str s27, [x23], #0x4\n" + "str s31, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "st1 { v27.h }[2], [x23]\n" + "st1 { v31.h }[2], [x21]\n" + "b 298f\n" + "285:" // Height 6: Partial direct writeback: partial_1_24 + "tbz x16, #0, 298f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "str h27, [x23, #0x0]\n" + "str h31, [x21, #0x0]\n" + "b 298f\n" + "286:" // Height 6: Partial direct writeback: partial_4_16 + "tbz x16, #2, 288f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #1, 287f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "st1 { v30.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "st1 { v26.h }[6], [x23]\n" + "st1 { v30.h }[6], [x21]\n" + "b 298f\n" + "287:" // Height 6: Partial direct writeback: partial_1_20 + "tbz x16, #0, 298f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "st1 { v26.h }[4], [x23]\n" + "st1 { v30.h }[4], [x21]\n" + "b 298f\n" + "288:" // Height 6: Partial direct writeback: partial_2_16 + "tbz x16, #1, 289f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "str s26, 
[x23], #0x4\n" + "str s30, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "st1 { v26.h }[2], [x23]\n" + "st1 { v30.h }[2], [x21]\n" + "b 298f\n" + "289:" // Height 6: Partial direct writeback: partial_1_16 + "tbz x16, #0, 298f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "str h26, [x23, #0x0]\n" + "str h30, [x21, #0x0]\n" + "b 298f\n" + "290:" // Height 6: Partial direct writeback: partial_8_0 + "tbz x16, #3, 294f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "tbz x16, #2, 292f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #1, 291f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "st1 { v29.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "st1 { v25.h }[6], [x23]\n" + "st1 { v29.h }[6], [x21]\n" + "b 298f\n" + "291:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 298f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "st1 { v25.h }[4], [x23]\n" + "st1 { v29.h }[4], [x21]\n" + "b 298f\n" + "292:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 293f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "str s25, [x23], #0x4\n" + "str s29, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { 
v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], [x25]\n" + "st1 { v25.h }[2], [x23]\n" + "st1 { v29.h }[2], [x21]\n" + "b 298f\n" + "293:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 298f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "str h25, [x23, #0x0]\n" + "str h29, [x21, #0x0]\n" + "b 298f\n" + "294:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 296f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #1, 295f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "st1 { v24.h }[6], [x23]\n" + "st1 { v28.h }[6], [x21]\n" + "b 298f\n" + "295:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, #0, 298f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "st1 { v24.h }[4], [x23]\n" + "st1 { v28.h }[4], [x21]\n" + "b 298f\n" + "296:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 297f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "st1 { v24.h }[2], [x23]\n" + "st1 { v28.h }[2], [x21]\n" + "b 298f\n" + "297:" // Height 6: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, 
#0x0]\n" + "str h20, [x25, #0x0]\n" + "str h24, [x23, #0x0]\n" + "str h28, [x21, #0x0]\n" + "298:" // Height 6: Partial direct writeback: Done + "b 300f\n" + "299:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "300:" // Height 6: Writeback done + "subs x16, x16, #0x20\n" + "bgt 253b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 302f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 301f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "301:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "302:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), 
[offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp deleted file mode 100644 index 4147ab60dc..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../performance_parameters.hpp" -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_fp32_mla_16x4 -{ -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return 16; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 2.866 }; - - case CPUModel::A53: - return { 1.419 }; - - case CPUModel::A73: - return { 2.551 }; - - default: - return { 6.25 }; - } - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_hybrid_fp32_mla_16x4; - - hybrid_fp32_mla_16x4(const CPUInfo *ci) - { - if (ci->get_cpu_model() == 
CPUModel::A55r1) { - kernel = a64_hybrid_fp32_mla_16x4_a55; - } else if (ci->get_cpu_model() == CPUModel::X1) { - kernel = a64_hybrid_fp32_mla_16x4_x1; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp deleted file mode 100644 index 94fcd1064e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp +++ /dev/null @@ -1,2427 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "temploadreg0 .req X2\n" - "temploadreg1 .req X3\n" - "temploadreg2 .req X4\n" - "temploadreg3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" 
- "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], 
#0x8]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - 
"fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" 
- "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla 
v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], 
#-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins 
v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], 
#0x48]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, 
v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "temploadreg0 .req X4\n" - "temploadreg1 .req X5\n" - "temploadreg2 .req X6\n" - "temploadreg3 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" 
- "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v27.16b, v19.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" 
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, 
[%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - 
"ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v1.d[1], temploadreg1\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ins v11.d[1], temploadreg3\n" - "ins v2.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, 
v5.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, 
v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v14.d[1], 
temploadreg2\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr 
temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, 
v6.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "add %[b_ptr0], 
%[b_ptr0], #0x80\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, 
v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" 
- "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v28.16b, v16.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "mov v29.16b, v17.16b\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "mov v30.16b, v18.16b\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "mov v31.16b, v19.16b\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ins v14.d[1], temploadreg2\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, 
[a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d7, [a_ptr3]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v7.d[1], temploadreg3\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], 
#0x38]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ins v8.d[1], 
temploadreg0\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v1.d[1], temploadreg1\n" - "fmla v31.4s, 
v11.4s, v7.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ins v2.d[1], temploadreg2\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ins v3.d[1], temploadreg3\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v30.4s, v10.4s, 
v7.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, 
[%[a_ptr0], #0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d7, [a_ptr3]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v7.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v27.4s, v15.4s, 
v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - 
"fmla v26.4s, v14.4s, v2.s[3]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" 
- "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, 
v9.4s, v2.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, 
v10.4s, v3.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" 
- "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, [c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", 
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" 
- "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - 
"fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, 
v5.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, 
v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v17.4s, 
v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], 
%[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v27.16b, v19.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], 
#0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla 
v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr 
q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - 
"fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, 
v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" 
- "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - 
"ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), 
[biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v28.16b, v16.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "mov v29.16b, v17.16b\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "mov v30.16b, v18.16b\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "mov v31.16b, v19.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, 
[c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla 
v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v15.4s, 
v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, 
v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, 
v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - 
"fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - 
"fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla 
v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla 
v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, [c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq 
a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" 
(biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - 
"fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - 
"fmla v20.4s, v8.4s, v5.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, 
v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" 
- "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, 
[%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" 
(c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov v27.16b, v19.16b\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add 
a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla 
v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, 
v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, 
v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, 
v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - 
"fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 
6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str 
q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov v28.16b, v16.16b\n" - "add a_ptr1, a_ptr1, #0x10\n" - "mov v29.16b, v17.16b\n" - "add a_ptr2, a_ptr2, #0x10\n" - "mov v30.16b, v18.16b\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov v31.16b, v19.16b\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - 
"cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, 
v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, 
v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "fmla v28.4s, v8.4s, v7.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "fmla v29.4s, v9.4s, v7.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "fmla v30.4s, v10.4s, v7.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "fmla v31.4s, v11.4s, v7.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - 
"ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla v28.4s, v8.4s, v7.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v29.4s, v9.4s, v7.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v30.4s, v10.4s, v7.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "fmla v31.4s, v11.4s, v7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, 
v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, 
v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "fmla v28.4s, v8.4s, v7.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "fmla v29.4s, v9.4s, v7.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "fmla v30.4s, v10.4s, v7.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "fmla v31.4s, v11.4s, v7.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr q9, [%[b_ptr0], 
#-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla v28.4s, v8.4s, v7.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v29.4s, v9.4s, v7.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v30.4s, v10.4s, v7.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "fmla v31.4s, v11.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla 
v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, 
v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, 
[c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_hybrid_fp32_mla_4x8; - - hybrid_fp32_mla_4x8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp deleted file mode 100644 index 7442d258ec..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp +++ /dev/null @@ -1,1934 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[4]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q18, [%[b_ptr0], 
#0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "ldr q9, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "fmla 
v25.4s, v16.4s, v9.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, 
[%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "ldr q16, [%[b_ptr0]]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "ldr q10, [a_ptr2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla 
v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], 
%[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "str q26, [c_ptr2]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - 
"add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q11, [a_ptr3]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, 
v11.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, 
v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", 
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q12, [a_ptr4]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, 
v0.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, 
v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v24.4s, 
v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) 
- : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - 
"ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q13, [a_ptr5]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, 
v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q19, 
[%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, 
[a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm 
__volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "mov v30.16b, v24.16b\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "ldr q6, [a_ptr6]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q14, [a_ptr6]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" 
- "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr6, a_ptr6, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q6, [a_ptr6, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, 
v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - "prfm PSTL1KEEP, [c_ptr6]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, 
v6.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, 
v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "ldr s6, [a_ptr6]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "add a_ptr6, a_ptr6, #0x4\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmax v30.4s, v30.4s, v22.4s\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "fmin v30.4s, v30.4s, v23.4s\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - "str q30, [c_ptr6]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" 
- ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; - default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "mov v30.16b, v24.16b\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "mov v31.16b, v24.16b\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "ldr q6, [a_ptr6]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "ldr q7, 
[a_ptr7]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "add a_ptr7, a_ptr7, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q15, [a_ptr7]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr6, a_ptr6, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr7, a_ptr7, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" - "ldr q18, 
[%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q6, [a_ptr6, #-0x10]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q7, [a_ptr7, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v31.4s, v16.4s, v15.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v31.4s, v17.4s, v15.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v31.4s, v18.4s, v15.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "fmla v31.4s, v19.4s, v15.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, 
[%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - "prfm PSTL1KEEP, [c_ptr6]\n" - "prfm PSTL1KEEP, [c_ptr7]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "ldr q15, [a_ptr7]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr7, a_ptr7, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add 
%[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v31.4s, v16.4s, v15.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v31.4s, v17.4s, v15.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v31.4s, v18.4s, v15.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "fmla v31.4s, v19.4s, v15.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" 
- "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "ldr s6, [a_ptr6]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "add a_ptr6, a_ptr6, #0x4\n" - "ldr s7, [a_ptr7]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "add a_ptr7, a_ptr7, #0x4\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmax v30.4s, v30.4s, v22.4s\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmax v31.4s, v31.4s, v22.4s\n" - "str q26, [c_ptr2]\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "fmin v30.4s, v30.4s, v23.4s\n" - "fmin v31.4s, v31.4s, v23.4s\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - "str q30, [c_ptr6]\n" - "str q31, 
[c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_fp32_mla_6x16( ARGLIST ); + +class cls_a64_hybrid_fp32_mla_6x16 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 2.00 }; + + case CPUModel::A53: + return { 1.43 }; + + case CPUModel::A73: + return { 2.56 }; + + default: + return { 6.26 }; + } + } + + StdTransformsFixed transforms = {}; + + // Default to the generic 
kernel + kern_type kernel=a64_hybrid_fp32_mla_6x16; + + cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp new file mode 100644 index 0000000000..884e8986c8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp @@ -0,0 +1,3430 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 171f\n" + "cmp %x[M], #0x4\n" + "bgt 137f\n" + "beq 103f\n" + "cmp %x[M], #0x2\n" + "bgt 69f\n" + "beq 35f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x14, 4f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "b 15f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x16, #0x10\n" + "bge 13f\n" + "tbz x16, #3, 8f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 6f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 5f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 12f\n" + "5:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 12f\n" + "ldr s11, [x13, #0x0]\n" + "b 12f\n" + "6:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 7f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 12f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 12f\n" + "7:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 12f\n" + "ldr s10, [x13, #0x0]\n" + "b 12f\n" + "8:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 10f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 9f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 12f\n" + "9:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 12f\n" + "ldr s9, [x13, #0x0]\n" + "b 12f\n" + "10:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 11f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 12f\n" + "11:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "12:" // 
Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 15f\n" + "13:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x12, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x11, #0x4\n" + "blt 21f\n" + "cmp x11, #0x8\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x4\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x8\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + 
"fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x1\n" + "add x15, x15, #0x40\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "cbnz x11, 22b\n" + "23:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, 
x19\n" + "bne 16b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 24f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "24:" // Height 1: No activation + "cmp x16, #0x10\n" + "bge 33f\n" + "tbz x16, #3, 28f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 26f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 25f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v11.s }[2], [x13]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 32f\n" + "str s11, [x13, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 27f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v10.s }[2], [x13]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 32f\n" + "str s10, [x13, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 30f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 29f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v9.s }[2], [x13]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 32f\n" + "str s9, [x13, #0x0]\n" + "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 31f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v8.s }[2], [x13]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str 
q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "34:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" + "bgt 3b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 36f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "37:" // Height 2: Column loop + "cbz x14, 38f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 49f\n" + "38:" // Height 2: no bias + "tbz %x[flags], #0, 48f\n" + "cmp x16, #0x10\n" + "bge 47f\n" + "tbz x16, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 46f\n" + "ldr s10, 
[x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add 
x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "52:" // Height 2: input setup done + "cmp x11, #0x4\n" + "blt 55f\n" + "cmp x11, #0x8\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x8\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, 
v6.4s, v1.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 57f\n" + "56:" // Height 2: Multiply 
loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x1\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "cbnz x11, 56b\n" + "57:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 58f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "58:" // Height 2: No activation + "cmp x16, #0x10\n" + "bge 67f\n" + "tbz x16, #3, 62f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 60f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 59f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + 
"tbz x16, #0, 66f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 61f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 66f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 63f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 66f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 65f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done + "b 68f\n" + "67:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "68:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 37b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 70f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, 
x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 71f\n" + "70:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "71:" // Height 3: Column loop + "cbz x14, 72f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 83f\n" + "72:" // Height 3: no bias + "tbz %x[flags], #0, 82f\n" + "cmp x16, #0x10\n" + "bge 81f\n" + "tbz x16, #3, 76f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 74f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 73f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 80f\n" + "73:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 80f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 80f\n" + "74:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 75f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 80f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 80f\n" + "75:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 80f\n" + 
"ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 80f\n" + "76:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 78f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 77f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 80f\n" + "77:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 80f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 80f\n" + "78:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 79f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 80f\n" + "79:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "80:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 83f\n" + "81:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 83f\n" + "82:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "83:" 
// Height 3: setup done + "mov x12, #0x0\n" + "84:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 85f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 86f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 86f\n" + "85:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "86:" // Height 3: input setup done + "cmp x11, #0x4\n" + "blt 89f\n" + "cmp x11, #0x8\n" + "blt 88f\n" + "87:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "cmp x11, #0x8\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + 
"ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "bge 87b\n" + "88:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla 
v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "89:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 91f\n" + "90:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, 
[x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "cbnz x11, 90b\n" + "91:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 84b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 92f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "92:" // Height 3: No activation + "cmp x16, #0x10\n" + "bge 101f\n" + "tbz x16, #3, 96f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], 
#0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 94f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 93f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 100f\n" + "93:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 100f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 100f\n" + "94:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 95f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 100f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 98f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 97f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 100f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 99f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" 
+ "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "102:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 71b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 104f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 105f\n" + "104:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "105:" // Height 4: Column loop + "cbz x14, 106f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 117f\n" + "106:" // Height 4: no bias + "tbz %x[flags], #0, 116f\n" + "cmp x16, #0x10\n" + "bge 115f\n" + "tbz x16, #3, 110f\n" + "ld1 { v8.4s }, [x13], 
#0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 108f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 107f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 114f\n" + "107:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 114f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 114f\n" + "108:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 109f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 114f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 114f\n" + "109:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 114f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 114f\n" + "110:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 111f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], 
[x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 114f\n" + "111:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 114f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 114f\n" + "112:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 113f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 114f\n" + "113:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "114:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 117f\n" + "115:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 117f\n" + "116:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "117:" // Height 4: setup done + "mov x12, #0x0\n" + "118:" // Height 4: String loop + "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 119f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 120f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 120f\n" + "119:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "120:" // Height 4: input setup done + "cmp x11, #0x4\n" + "blt 123f\n" + "cmp x11, #0x8\n" + "blt 122f\n" + "121:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "cmp x11, #0x8\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla 
v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "bge 121b\n" + "122:" // Height 4: Multiply loop: Single iteration only + "sub 
x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, 
v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "123:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 125f\n" + "124:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "cbnz x11, 124b\n" + "125:" // Height 4: Multiply loop: No odd multiplies + 
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 118b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 126f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "126:" // Height 4: No activation + "cmp x16, #0x10\n" + "bge 135f\n" + "tbz x16, #3, 130f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 128f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], 
#0x10\n" + "tbz x16, #1, 127f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 134f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 129f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 134f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 132f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 131f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 134f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 133f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v8.s }[2], 
[x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 134f\n" + "133:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "136:" // Height 4: Writeback done + "subs x16, x16, #0x10\n" + "bgt 105b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 138f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 139f\n" + "138:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "139:" // Height 5: Column loop + "cbz x14, 140f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov 
v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 151f\n" + "140:" // Height 5: no bias + "tbz %x[flags], #0, 150f\n" + "cmp x16, #0x10\n" + "bge 149f\n" + "tbz x16, #3, 144f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 142f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 141f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 148f\n" + "141:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 148f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 148f\n" + "142:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 143f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 148f\n" + "ld1 { v10.s }[2], [x13]\n" + 
"ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 148f\n" + "143:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 148f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 148f\n" + "144:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 146f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 145f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 148f\n" + "145:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 148f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 148f\n" + "146:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 147f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 148f\n" + "147:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "148:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" 
+ "sub x23, x23, x19\n" + "b 151f\n" + "149:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 151f\n" + "150:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "151:" // Height 5: setup done + "mov x12, #0x0\n" + "152:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 153f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 154f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 154f\n" + "153:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, 
x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "154:" // Height 5: input setup done + "cmp x11, #0x4\n" + "blt 157f\n" + "cmp x11, #0x8\n" + "blt 156f\n" + "155:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "cmp x11, #0x8\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, 
v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "bge 155b\n" + "156:" // Height 5: Multiply loop: Single iteration only + 
"sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla 
v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "157:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 159f\n" + "158:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, 
v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "cbnz x11, 158b\n" + "159:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 152b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 160f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, 
v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "160:" // Height 5: No activation + "cmp x16, #0x10\n" + "bge 169f\n" + "tbz x16, #3, 164f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 162f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 161f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 168f\n" + "161:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 168f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x16, #1, 163f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], 
#0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 168f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 166f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 165f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 168f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 167f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, 
#0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "170:" // Height 5: Writeback done + "subs x16, x16, #0x10\n" + "bgt 139b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 172f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 173f\n" + "172:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "173:" // Height 6: Column loop + "cbz x14, 174f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" 
+ "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 185f\n" + "174:" // Height 6: no bias + "tbz %x[flags], #0, 184f\n" + "cmp x16, #0x10\n" + "bge 183f\n" + "tbz x16, #3, 178f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 176f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 175f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 182f\n" + "175:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 182f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 182f\n" + "176:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 177f\n" + "ldr d10, 
[x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 182f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 182f\n" + "177:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 182f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 182f\n" + "178:" // Height 6: Partial accumulate: partial_4_0 + "tbz x16, #2, 180f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 179f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 182f\n" + "179:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 182f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 182f\n" + "180:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 181f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], 
[x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 182f\n" + "181:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "182:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 185f\n" + "183:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 185f\n" + "184:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "185:" // Height 6: setup done + "mov x12, #0x0\n" + "186:" // Height 6: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 187f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 188f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 188f\n" + "187:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "188:" // Height 6: input setup done + "cmp x11, #0x4\n" + "blt 191f\n" + "cmp x11, #0x8\n" + "blt 190f\n" + "189:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + 
"sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "cmp x11, #0x8\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr q7, 
[x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "bge 189b\n" + "190:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + 
"prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla 
v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "191:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 193f\n" + "192:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, 
[x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "cbnz x11, 192b\n" + "193:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 186b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 194f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, 
v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "194:" // Height 6: No activation + "cmp x16, #0x10\n" + "bge 203f\n" + "tbz x16, #3, 198f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 196f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 195f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, 
[x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 202f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 197f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 202f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 200f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 199f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, 
#0, 202f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 201f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "204:" // Height 6: Writeback done + "subs x16, x16, #0x10\n" + "bgt 173b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 206f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 205f\n" + "add 
x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "205:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "206:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp new file mode 100644 index 0000000000..043d0643f0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + /* ARGLIST is the parameter-type list shared by the kernel prototype and the kern_type typedef below; it is undefined again at the end of this header. */ +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_fp32_mla_8x4( ARGLIST ); + /* Descriptor class for the a64_hybrid_fp32_mla_8x4 kernel: it publishes the operand and result types (both float), the blocking constants, and a function pointer that defaults to the kernel declared above. */ +class cls_a64_hybrid_fp32_mla_8x4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + /* NOTE(review): out_width() is the only blocking accessor in this class that is not declared constexpr - confirm whether that is intentional. */ + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + /* NOTE(review): StdTransformsFixed is named here without template arguments; they may have been lost when this patch text was extracted - confirm against the upstream header. */ + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp32_mla_8x4; + /* The constructor takes an unnamed CPUInfo pointer and has an empty body, so kernel keeps the default assigned above. */ + cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp new file mode 100644 index 0000000000..3ab6cad368 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp @@ -0,0 +1,2195 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_8x4 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 155f\n" + "cmp %x[M], #0x6\n" + "bgt 133f\n" + "beq 111f\n" + "cmp %x[M], #0x4\n" + "bgt 89f\n" + "beq 67f\n" + "cmp %x[M], #0x2\n" + "bgt 45f\n" + "beq 23f\n" + "ldr x6, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x8, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x17, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x8, 4f\n" + "ldr q24, [x8, #0x0]\n" + "add x8, x8, #0x10\n" + "b 9f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 8f\n" + "cmp x6, #0x4\n" + "bge 7f\n" + "tbz x6, #1, 5f\n" + "ldr d24, [x17], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 6f\n" + "ld1 { v24.s }[2], [x17]\n" + "b 6f\n" + "5:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "6:" // Height 1: Partial accumulate: Done + "sub x17, x17, x19\n" + "b 9f\n" + "7:" // Height 1: full accumulate + "ldr q24, [x17, #0x0]\n" + "b 9f\n" + "8:" // Height 1: no accumulate + "movi v24.16b, #0x0\n" + "9:" // Height 1: setup done + "mov x16, #0x0\n" + "10:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 11f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "cbnz x16, 12f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "b 12f\n" + "11:" // Height 1: setup direct input + "mov x14, %x[input_ptr]\n" + "12:" // Height 1: input setup done + "cmp x15, #0x4\n" + "blt 15f\n" + "cmp x15, #0x8\n" + "blt 14f\n" + "13:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q11, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + 
"sub x15, x15, #0x4\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "bge 13b\n" + "14:" // Height 1: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "ldr q15, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "15:" // Height 1: Multiply loop: Main loop skip + "cbz x15, 17f\n" + "16:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "add x7, x7, #0x10\n" + "cbnz x15, 16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 10b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "tbz %x[flags], #1, 18f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "18:" // Height 1: No activation + "cmp x6, #0x4\n" + "bge 21f\n" + "tbz x6, #1, 19f\n" + "str d24, [x17], #0x8\n" + "tbz x6, #0, 20f\n" + "st1 { v24.s }[2], [x17]\n" + "b 20f\n" + "19:" // Height 1: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "20:" // Height 1: Partial direct writeback: Done + "b 22f\n" + "21:" // Height 1: Full writeback + "str q24, [x17, #0x0]\n" + "add x17, x17, #0x10\n" + "22:" // Height 1: Writeback done + "subs x6, x6, #0x4\n" + "bgt 3b\n" + "b 178f\n" + "23:" // Height 2 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 
24f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "add x13, x13, x19, LSL #2\n" + "b 25f\n" + "24:" // Height 2: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "25:" // Height 2: Column loop + "cbz x8, 26f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "b 31f\n" + "26:" // Height 2: no bias + "tbz %x[flags], #0, 30f\n" + "cmp x6, #0x4\n" + "bge 29f\n" + "tbz x6, #1, 27f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 28f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "b 28f\n" + "27:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "28:" // Height 2: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "b 31f\n" + "29:" // Height 2: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "b 31f\n" + "30:" // Height 2: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "31:" // Height 2: setup done + "mov x16, #0x0\n" + "32:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "cbnz x16, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 2: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "34:" // Height 2: input setup done + "cmp x15, #0x4\n" + "blt 37f\n" + "cmp x15, #0x8\n" + "blt 36f\n" + "35:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla 
v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "bge 35b\n" + "36:" // Height 2: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "37:" // Height 2: Multiply loop: Main loop skip + "cbz x15, 39f\n" + "38:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "cbnz x15, 38b\n" + "39:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 32b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 40f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, 
v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "40:" // Height 2: No activation + "cmp x6, #0x4\n" + "bge 43f\n" + "tbz x6, #1, 41f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "tbz x6, #0, 42f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "b 42f\n" + "41:" // Height 2: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "42:" // Height 2: Partial direct writeback: Done + "b 44f\n" + "43:" // Height 2: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "44:" // Height 2: Writeback done + "subs x6, x6, #0x4\n" + "bgt 25b\n" + "b 178f\n" + "45:" // Height 3 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 46f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "add x11, x11, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 3: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "47:" // Height 3: Column loop + "cbz x8, 48f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "b 53f\n" + "48:" // Height 3: no bias + "tbz %x[flags], #0, 52f\n" + "cmp x6, #0x4\n" + "bge 51f\n" + "tbz x6, #1, 49f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 50f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "b 50f\n" + "49:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + 
"50:" // Height 3: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "b 53f\n" + "51:" // Height 3: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "b 53f\n" + "52:" // Height 3: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "53:" // Height 3: setup done + "mov x16, #0x0\n" + "54:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "cbnz x16, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "b 56f\n" + "55:" // Height 3: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "56:" // Height 3: input setup done + "cmp x15, #0x4\n" + "blt 59f\n" + "cmp x15, #0x8\n" + "blt 58f\n" + "57:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x12, x12, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, 
v10.4s, v2.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "bge 57b\n" + "58:" // Height 3: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x12, x12, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "59:" // Height 3: Multiply loop: Main loop skip + "cbz x15, 61f\n" + "60:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "cbnz x15, 60b\n" + "61:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "tbz %x[flags], #1, 62f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmax v24.4s, v24.4s, 
v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "62:" // Height 3: No activation + "cmp x6, #0x4\n" + "bge 65f\n" + "tbz x6, #1, 63f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "tbz x6, #0, 64f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "b 64f\n" + "63:" // Height 3: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "64:" // Height 3: Partial direct writeback: Done + "b 66f\n" + "65:" // Height 3: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "66:" // Height 3: Writeback done + "subs x6, x6, #0x4\n" + "bgt 47b\n" + "b 178f\n" + "67:" // Height 4 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 68f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "b 69f\n" + "68:" // Height 4: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "69:" // Height 4: Column loop + "cbz x8, 70f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "b 75f\n" + "70:" // Height 4: no bias + "tbz %x[flags], #0, 74f\n" + "cmp x6, #0x4\n" + "bge 73f\n" + "tbz x6, #1, 71f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 72f\n" + "ld1 
{ v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "b 72f\n" + "71:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "72:" // Height 4: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "b 75f\n" + "73:" // Height 4: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "b 75f\n" + "74:" // Height 4: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "75:" // Height 4: setup done + "mov x16, #0x0\n" + "76:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 77f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "cbnz x16, 78f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 78f\n" + "77:" // Height 4: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "78:" // Height 4: input setup done + "cmp x15, #0x4\n" + "blt 81f\n" + "cmp x15, #0x8\n" + "blt 80f\n" + "79:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla 
v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x12, x12, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "bge 79b\n" + "80:" // Height 4: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "add x12, x12, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "81:" // Height 4: Multiply loop: Main loop 
skip + "cbz x15, 83f\n" + "82:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "cbnz x15, 82b\n" + "83:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 76b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 84f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "84:" // Height 4: No activation + "cmp x6, #0x4\n" + "bge 87f\n" + "tbz x6, #1, 85f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "tbz x6, #0, 86f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "b 86f\n" + "85:" // Height 4: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "86:" // Height 4: Partial direct writeback: Done + "b 88f\n" + "87:" // Height 4: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "88:" // 
Height 4: Writeback done + "subs x6, x6, #0x4\n" + "bgt 69b\n" + "b 178f\n" + "89:" // Height 5 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 90f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 91f\n" + "90:" // Height 5: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "91:" // Height 5: Column loop + "cbz x8, 92f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "b 97f\n" + "92:" // Height 5: no bias + "tbz %x[flags], #0, 96f\n" + "cmp x6, #0x4\n" + "bge 95f\n" + "tbz x6, #1, 93f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 94f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "b 94f\n" + "93:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "94:" // Height 5: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 97f\n" + "95:" // Height 5: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, 
#0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "b 97f\n" + "96:" // Height 5: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "97:" // Height 5: setup done + "mov x16, #0x0\n" + "98:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "cbnz x16, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 100f\n" + "99:" // Height 5: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "100:" // Height 5: input setup done + "cmp x15, #0x4\n" + "blt 103f\n" + "cmp x15, #0x8\n" + "blt 102f\n" + "101:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x10, 
#0x80]\n" + "add x28, x28, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "bge 101b\n" + "102:" // Height 5: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, 
v15.4s, v4.s[3]\n" + "103:" // Height 5: Multiply loop: Main loop skip + "cbz x15, 105f\n" + "104:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "cbnz x15, 104b\n" + "105:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 106f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "106:" // Height 5: No activation + "cmp x6, #0x4\n" + "bge 109f\n" + "tbz x6, #1, 107f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "tbz x6, #0, 108f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "b 108f\n" + "107:" // Height 5: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, 
#0x0]\n" + "108:" // Height 5: Partial direct writeback: Done + "b 110f\n" + "109:" // Height 5: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "110:" // Height 5: Writeback done + "subs x6, x6, #0x4\n" + "bgt 91b\n" + "b 178f\n" + "111:" // Height 6 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 112f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 113f\n" + "112:" // Height 6: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "113:" // Height 6: Column loop + "cbz x8, 114f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "b 119f\n" + "114:" // Height 6: no bias + "tbz %x[flags], #0, 118f\n" + "cmp x6, #0x4\n" + "bge 117f\n" + "tbz x6, #1, 115f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 116f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s 
}[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "116:" // Height 6: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 6: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "b 119f\n" + "118:" // Height 6: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "119:" // Height 6: setup done + "mov x16, #0x0\n" + "120:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "cbnz x16, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 122f\n" + "121:" // Height 6: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "122:" // Height 
6: input setup done + "cmp x15, #0x4\n" + "blt 125f\n" + "cmp x15, #0x8\n" + "blt 124f\n" + "123:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "bge 123b\n" + "124:" // Height 6: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, 
#0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "125:" // Height 6: Multiply loop: Main loop skip + "cbz x15, 127f\n" + "126:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "cbnz x15, 126b\n" + "127:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" 
+ "bne 120b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 128f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "128:" // Height 6: No activation + "cmp x6, #0x4\n" + "bge 131f\n" + "tbz x6, #1, 129f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "tbz x6, #0, 130f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "b 130f\n" + "129:" // Height 6: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "130:" // Height 6: Partial direct writeback: Done + "b 132f\n" + "131:" // Height 6: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "132:" // Height 6: Writeback done + "subs x6, x6, #0x4\n" + "bgt 113b\n" + "b 178f\n" + 
"133:" // Height 7 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 134f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 135f\n" + "134:" // Height 7: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "135:" // Height 7: Column loop + "cbz x8, 136f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "b 141f\n" + "136:" // Height 7: no bias + "tbz %x[flags], #0, 140f\n" + "cmp x6, #0x4\n" + "bge 139f\n" + "tbz x6, #1, 137f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "ldr d30, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 138f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "ld1 { v30.s }[2], [x23]\n" + "b 138f\n" + "137:" // Height 7: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr 
s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "ldr s30, [x23, #0x0]\n" + "138:" // Height 7: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 141f\n" + "139:" // Height 7: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "ldr q30, [x23, #0x0]\n" + "b 141f\n" + "140:" // Height 7: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "141:" // Height 7: setup done + "mov x16, #0x0\n" + "142:" // Height 7: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 143f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "cbnz x16, 144f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 144f\n" + "143:" // Height 7: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "144:" // Height 7: input setup done + "cmp x15, #0x4\n" + "blt 147f\n" + "cmp x15, #0x8\n" + "blt 
146f\n" + "145:" // Height 7: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "bge 145b\n" + "146:" // Height 7: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, 
[x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v30.4s, v13.4s, v6.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v30.4s, v14.4s, v6.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "fmla v30.4s, v15.4s, v6.s[3]\n" + "147:" // Height 7: Multiply loop: Main loop skip + "cbz x15, 149f\n" + "148:" // Height 7: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + 
"sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "fmla v30.4s, v16.4s, v6.s[0]\n" + "cbnz x15, 148b\n" + "149:" // Height 7: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 142b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 150f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "150:" // Height 7: No activation + "cmp x6, #0x4\n" + "bge 153f\n" + "tbz x6, #1, 151f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "str d30, [x23], #0x8\n" + "tbz x6, #0, 152f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "st1 { v30.s }[2], [x23]\n" + "b 152f\n" + "151:" // Height 7: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, 
#0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "str s30, [x23, #0x0]\n" + "152:" // Height 7: Partial direct writeback: Done + "b 154f\n" + "153:" // Height 7: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "str q30, [x23, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "154:" // Height 7: Writeback done + "subs x6, x6, #0x4\n" + "bgt 135b\n" + "b 178f\n" + "155:" // Height 8 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 156f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "ldr x21, [%x[output_ptr], #0x38]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add %x[output_ptr], %x[output_ptr], #0x40\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 157f\n" + "156:" // Height 8: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "157:" // Height 8: Column loop + "cbz x8, 158f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov 
v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "mov v31.16b, v24.16b\n" + "b 163f\n" + "158:" // Height 8: no bias + "tbz %x[flags], #0, 162f\n" + "cmp x6, #0x4\n" + "bge 161f\n" + "tbz x6, #1, 159f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "ldr d30, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 160f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "ld1 { v30.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 160f\n" + "159:" // Height 8: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "ldr s30, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "160:" // Height 8: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 163f\n" + "161:" // Height 8: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "ldr q30, [x23, #0x0]\n" + "ldr q31, [x21, #0x0]\n" + "b 163f\n" + "162:" // Height 8: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "163:" // Height 8: setup done + "mov x16, #0x0\n" + "164:" // Height 8: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, 
[%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x16, 166f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 166f\n" + "165:" // Height 8: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "166:" // Height 8: input setup done + "cmp x15, #0x4\n" + "blt 169f\n" + "cmp x15, #0x8\n" + "blt 168f\n" + "167:" // Height 8: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q7, [x20, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v31.4s, v8.4s, 
v7.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v31.4s, v9.4s, v7.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v31.4s, v10.4s, v7.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "bge 167b\n" + "168:" // Height 8: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q7, [x20, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + 
"prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v31.4s, v12.4s, v7.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v30.4s, v13.4s, v6.s[1]\n" + "fmla v31.4s, v13.4s, v7.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v30.4s, v14.4s, v6.s[2]\n" + "fmla v31.4s, v14.4s, v7.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "fmla v30.4s, v15.4s, v6.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "169:" // Height 8: Multiply loop: Main loop skip + "cbz x15, 171f\n" + "170:" // Height 8: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr s7, [x20], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "fmla v30.4s, v16.4s, v6.s[0]\n" + "fmla v31.4s, v16.4s, v7.s[0]\n" + "cbnz x15, 170b\n" + "171:" // Height 8: Multiply loop: No odd multiplies 
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 164b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 172f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "172:" // Height 8: No activation + "cmp x6, #0x4\n" + "bge 175f\n" + "tbz x6, #1, 173f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "str d30, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x6, #0, 174f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "st1 { v30.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 174f\n" + "173:" // Height 8: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "str s30, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "174:" // Height 8: Partial direct 
writeback: Done + "b 176f\n" + "175:" // Height 8: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "str q30, [x23, #0x0]\n" + "str q31, [x21, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "176:" // Height 8: Writeback done + "subs x6, x6, #0x4\n" + "bgt 157b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 178f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 177f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "177:" // Update direct input + "mov x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "178:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp new file mode 100644 index 0000000000..4bb7a1e0eb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" +/* ARGLIST is the parameter list shared by the kernel prototype and the kern_type typedef below; it is #undef'd at the end of this header. In the definition in a64_hybrid_s8qa_dot_4x16/generic.cpp the parameters are named, in order: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, qp, col_bias, plus one unnamed unsigned int. */ +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8qa_dot_4x16( ARGLIST ); +/* Descriptor class for the a64_hybrid_s8qa_dot_4x16 kernel: publishes the operand/result element types (both int8_t), the kernel function-pointer type, the blocking constants (out_height 4, out_width 16, k_unroll 4), the fact that accumulation is not supported, and a pointer to the kernel function itself. */ +class cls_a64_hybrid_s8qa_dot_4x16 +{ +public: + typedef int8_t operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() /* NOTE(review): unlike out_height() and k_unroll(), this accessor is not declared constexpr - confirm whether that is intentional. */ + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed transforms = {}; /* NOTE(review): StdTransformsFixed appears here without template arguments - presumably lost when this patch text was extracted; confirm against the real header. */ + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8qa_dot_4x16; + + cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *) /* The CPUInfo argument is ignored: the body is empty, so 'kernel' keeps its default value. */ + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp new file mode 100644 index 0000000000..3fb365bc1e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp @@ -0,0 +1,2072 @@ +/* + * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_s8qa_dot_4x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 94f\n" + "cmp %x[M], #0x2\n" + "bgt 63f\n" + "beq 32f\n" + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + "2:" // Height 
1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 12f\n" + "cmp x27, #0x20\n" + "blt 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x11, #0x80]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x11, #0xd0]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 
0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 9f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x11, #0x80]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xd0]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8f3 // 
sdot v19.4s, v7.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 11f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "12:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 19f\n" + "cmp x27, #0x4\n" + "blt 15f\n" + "13:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "tbnz %x[flags], #31, 14f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "14:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + "bge 13b\n" + "cbz x27, 19f\n" + "15:" // Height 1: Multiply loop: Skip odd blocks + "tbz x27, #1, 16f\n" + "ldr h0, [x26], #0x2\n" + "tbz x27, #0, 17f\n" + "ld1 { v0.b }[2], [x26]\n" + "b 17f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "17:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 18f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "18:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + "19:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" 
+ "tbnz %x[flags], #31, 20f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "ld1r { v1.4s }, [x19]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "20:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "ldr q3, [x10, #0x30]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x19]\n" + "add x10, x10, #0x40\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 21f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "21:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin 
v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 30f\n" + "tbz x12, #3, 25f\n" + "str d16, [x9], #0x8\n" + "tbz x12, #2, 23f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "tbz x12, #1, 22f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[14], [x9]\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[12], [x9]\n" + "b 29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x12, #1, 24f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[10], [x9]\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[8], [x9]\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x12, #2, 27f\n" + "str s16, [x9], #0x4\n" + "tbz x12, #1, 26f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[6], [x9]\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[4], [x9]\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x12, #1, 28f\n" + "str h16, [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[2], [x9]\n" + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback + "str q16, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x12, x12, #0x10\n" + "bgt 3b\n" + "b 126f\n" + "32:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 33f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 34f\n" + "33:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x28, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "38:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 43f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 
0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "40:" // Height 2: 
Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + 
".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 50f\n" + "cmp x27, #0x4\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + "bge 44b\n" + "cbz x27, 50f\n" + "46:" // Height 2: Multiply loop: Skip odd 
blocks + "tbz x27, #1, 47f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x27, #0, 48f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 48f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "48:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 49f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "49:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + "50:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 36b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 51f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "51:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add 
v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 52f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "52:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" 
+ "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 61f\n" + "tbz x12, #3, 56f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x12, #2, 54f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x12, #1, 53f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x12, #1, 55f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "b 60f\n" + "55:" 
// Height 2: Partial direct writeback: partial_1_8 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x12, #2, 58f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x12, #1, 57f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x12, #1, 59f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "62:" // Height 2: Writeback done + "subs x12, x12, #0x10\n" + "bgt 34b\n" + "b 126f\n" + "63:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 64f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 65f\n" + "64:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "65:" // Height 3: Column loop + 
"movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "66:" // Height 3: setup done + "mov x28, #0x0\n" + "67:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 68f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 69f\n" + "68:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "69:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 74f\n" + "cmp x27, #0x20\n" + "blt 72f\n" + "70:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, 
v2.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, 
v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 71f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "71:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "add 
x22, x22, #0x10\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, 
v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "74:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 81f\n" + "cmp x27, #0x4\n" + "blt 77f\n" + "75:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbnz %x[flags], #31, 76f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "76:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + "sub x27, x27, #0x4\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + 
"bge 75b\n" + "cbz x27, 81f\n" + "77:" // Height 3: Multiply loop: Skip odd blocks + "tbz x27, #1, 78f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x27, #0, 79f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "79:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 80f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "80:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" + "81:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 67b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 82f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x19]\n" + "addp v12.4s, v12.4s, 
v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "neg v3.4s, v3.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "82:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 83f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, 
v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "83:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin 
v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x12, #3, 87f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x12, #2, 85f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x12, #1, 84f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x12, #1, 
86f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 91f\n" + "86:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x12, #2, 89f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x12, #1, 88f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x12, #1, 90f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "93:" // Height 3: Writeback done + "subs x12, x12, #0x10\n" + "bgt 65b\n" + "b 126f\n" + "94:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic 
%x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 95f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "96:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "97:" // Height 4: setup done + "mov x28, #0x0\n" + "98:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 100f\n" + "99:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "100:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 105f\n" + "cmp x27, #0x20\n" + "blt 103f\n" + "101:" 
// Height 4: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, 
v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // 
sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 102f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "102:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bge 101b\n" + "103:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f83e11e 
// sdot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" + ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" + 
".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "105:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 112f\n" + "cmp x27, #0x4\n" + "blt 108f\n" + "106:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 107f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 
0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" + "bge 106b\n" + "cbz x27, 112f\n" + "108:" // Height 4: Multiply loop: Skip odd blocks + "tbz x27, #1, 109f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x27, #0, 110f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 110f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "110:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 111f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 
0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "111:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n" + "112:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 113f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "neg v4.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, 
v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "113:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + 
"sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz %x[flags], #5, 114f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, v10.4s\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "114:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + 
"srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin 
v31.4s, v31.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 123f\n" + "tbz x12, #3, 118f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x12, #2, 116f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x12, #1, 115f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x12, #1, 117f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: 
partial_4_0 + "tbz x12, #2, 120f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x12, #1, 119f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x12, #1, 121f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "124:" // Height 4: Writeback done + "subs x12, x12, #0x10\n" + "bgt 96b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 126f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 125f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "125:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "126:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" 
(input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp new file mode 100644 index 0000000000..6d4f3b2efe --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8qs_dot_6x16( ARGLIST ); + +class cls_a64_hybrid_s8qs_dot_6x16 +{ +public: + typedef int8_t operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8qs_dot_6x16; + + cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp new file mode 100644 index 0000000000..0e98ab8347 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -0,0 +1,3613 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_s8qs_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 141f\n" + "cmp %x[M], #0x4\n" + "bgt 113f\n" + "beq 85f\n" + "cmp %x[M], #0x2\n" + "bgt 57f\n" + "beq 29f\n" + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x15, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "4:" // Height 1: setup done + "mov x12, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 10f\n" + "cmp x11, #0x20\n" + "blt 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 
0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "bge 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, 
[x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "10:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 15f\n" + "cmp x11, #0x4\n" + "blt 12f\n" + "11:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 11b\n" + "cbz x11, 15f\n" + "12:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 13f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 14f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 14f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "14:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "15:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "ldr q1, [x16, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "add v9.4s, v9.4s, v1.4s\n" + "ldr q3, [x16, #0x30]\n" + "add v10.4s, v10.4s, v2.4s\n" + "add x16, x16, #0x40\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 16f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, 
#0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 17f\n" + "16:" // Height 1: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "17:" // Height 1: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "tbz %x[flags], #5, 18f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "18:" // Height 1: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smax v11.4s, v11.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" 
+ "bge 27f\n" + "tbz x15, #3, 22f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #2, 20f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "tbz x15, #1, 19f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[14], [x13]\n" + "b 26f\n" + "19:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[12], [x13]\n" + "b 26f\n" + "20:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 21f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[10], [x13]\n" + "b 26f\n" + "21:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[8], [x13]\n" + "b 26f\n" + "22:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 24f\n" + "str s8, [x13], #0x4\n" + "tbz x15, #1, 23f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[6], [x13]\n" + "b 26f\n" + "23:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[4], [x13]\n" + "b 26f\n" + "24:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 25f\n" + "str h8, [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[2], [x13]\n" + "b 26f\n" + "25:" // Height 1: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "26:" // Height 1: Partial direct writeback: Done + "b 28f\n" + "27:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "28:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 170f\n" + "29:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "b 31f\n" + 
"30:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "31:" // Height 2: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "32:" // Height 2: setup done + "mov x12, #0x0\n" + "33:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 34f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 35f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 35f\n" + "34:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "35:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 38f\n" + "cmp x11, #0x20\n" + "blt 37f\n" + "36:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 
0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "bge 36b\n" + "37:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + 
"prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb 
// sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "38:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 43f\n" + "cmp x11, #0x4\n" + "blt 40f\n" + "39:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 39b\n" + "cbz x11, 43f\n" + "40:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 41f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 42f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 42f\n" + "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "42:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "43:" // Height 2: Multiply loop: No odd multiplies + "ldr 
w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 33b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "ldr q2, [x16, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add v9.4s, v9.4s, v1.4s\n" + "add x16, x16, #0x40\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "tbz %x[flags], #4, 44f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 45f\n" + "44:" // Height 2: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "45:" // Height 2: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "tbz %x[flags], #5, 46f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, 
v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "46:" // Height 2: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "bge 55f\n" + "tbz x15, #3, 50f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #2, 48f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], 
[x9], #0x4\n" + "tbz x15, #1, 47f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "b 54f\n" + "47:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "b 54f\n" + "48:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 49f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "b 54f\n" + "49:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "b 54f\n" + "50:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 52f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "tbz x15, #1, 51f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "b 54f\n" + "51:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "b 54f\n" + "52:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 53f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "b 54f\n" + "53:" // Height 2: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "54:" // Height 2: Partial direct writeback: Done + "b 56f\n" + "55:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "56:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 31b\n" + "b 170f\n" + "57:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], 
%[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "add x27, x27, x19\n" + "b 59f\n" + "58:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "59:" // Height 3: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "60:" // Height 3: setup done + "mov x12, #0x0\n" + "61:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 62f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 63f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 63f\n" + "62:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "63:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 66f\n" + "cmp x11, #0x20\n" + "blt 65f\n" + "64:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, 
#0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, 
[x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "bge 64b\n" + "65:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, 
v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot 
v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "66:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 71f\n" + "cmp x11, #0x4\n" + "blt 68f\n" + "67:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 67b\n" + "cbz x11, 71f\n" + "68:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 69f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 70f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { 
v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 70f\n" + "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "70:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "71:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 61b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 72f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, 
[x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 73f\n" + "72:" // Height 3: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "73:" // Height 3: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 74f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, 
v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "74:" // Height 3: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + 
"smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 83f\n" + "tbz x15, #3, 78f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #2, 76f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "tbz x15, #1, 75f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "b 82f\n" + "75:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 82f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "b 82f\n" + "76:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 77f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "b 82f\n" + "77:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 82f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "b 82f\n" + "78:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 80f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "tbz x15, #1, 79f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "b 82f\n" + "79:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 82f\n" + "st1 { v8.b 
}[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "b 82f\n" + "80:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, #1, 81f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "b 82f\n" + "81:" // Height 3: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "82:" // Height 3: Partial direct writeback: Done + "b 84f\n" + "83:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "84:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 59b\n" + "b 170f\n" + "85:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 86f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "b 87f\n" + "86:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "87:" // Height 4: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, 
#0x0\n" + "88:" // Height 4: setup done + "mov x12, #0x0\n" + "89:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 90f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 91f\n" + "90:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "91:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 94f\n" + "cmp x11, #0x20\n" + "blt 93f\n" + "92:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, 
v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, 
v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "bge 92b\n" + "93:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, 
[x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "94:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 99f\n" + "cmp x11, #0x4\n" + "blt 96f\n" + "95:" // Height 
4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 95b\n" + "cbz x11, 99f\n" + "96:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 97f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 98f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 98f\n" + "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "98:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot 
v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "99:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 89b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, [x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "add x16, x16, #0x40\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "tbz %x[flags], #4, 100f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr 
q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 101f\n" + "100:" // Height 4: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "101:" // Height 4: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "tbz %x[flags], #5, 102f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and 
v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "102:" // Height 4: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, 
v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 111f\n" + "tbz x15, #3, 106f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #2, 104f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x15, #1, 103f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b 
}[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "b 110f\n" + "103:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], [x25]\n" + "b 110f\n" + "104:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 105f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "b 110f\n" + "105:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "b 110f\n" + "106:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 108f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x15, #1, 107f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "b 110f\n" + "107:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "b 110f\n" + "108:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 109f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "b 110f\n" + "109:" // Height 4: Partial direct writeback: 
partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "110:" // Height 4: Partial direct writeback: Done + "b 112f\n" + "111:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "112:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 87b\n" + "b 170f\n" + "113:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 114f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 115f\n" + "114:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "115:" // Height 5: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "116:" // Height 5: setup done + "mov x12, #0x0\n" + "117:" // Height 5: String loop + "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 118f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 119f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 119f\n" + "118:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "119:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 122f\n" + "cmp x11, #0x20\n" + "blt 121f\n" + "120:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // 
sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 
0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "bge 120b\n" + "121:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, 
v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 
0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "122:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 127f\n" + "cmp x11, #0x4\n" + "blt 124f\n" + "123:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, 
[x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 123b\n" + "cbz x11, 127f\n" + "124:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 125f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 126f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 126f\n" + "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, 
[x22, #0x0]\n" + "126:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "127:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 117b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ldr q3, [x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "add x16, x16, #0x40\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add 
v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 128f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 129f\n" + "128:" // Height 5: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "129:" // Height 5: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "tbz %x[flags], #5, 130f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, 
v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v4.16b, v24.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "130:" // Height 5: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" 
+ "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, 
v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #2, 132f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x15, #1, 131f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b 
}[14], [x23]\n" + "b 138f\n" + "131:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 138f\n" + "132:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 138f\n" + "133:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 138f\n" + "134:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x15, #1, 135f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 138f\n" + "135:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 138f\n" + "136:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str 
h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 138f\n" + "137:" // Height 5: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "138:" // Height 5: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "140:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 115b\n" + "b 170f\n" + "141:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 143f\n" + "142:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "143:" // Height 6: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, 
#0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "144:" // Height 6: setup done + "mov x12, #0x0\n" + "145:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 146f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 147f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 147f\n" + "146:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "147:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 150f\n" + "cmp x11, #0x20\n" + "blt 149f\n" + "148:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot 
v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + 
"ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, 
v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "bge 
148b\n" + "149:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb 
// sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 
0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" 
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "150:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 155f\n" + "cmp x11, #0x4\n" + "blt 152f\n" + "151:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, 
v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 151b\n" + "cbz x11, 155f\n" + "152:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 153f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 154f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 154f\n" + "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "154:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // 
sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "155:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 145b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ldr q3, [x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x16, x16, #0x40\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "tbz %x[flags], #4, 156f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr 
q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 157f\n" + "156:" // Height 6: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "157:" // Height 6: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v5.4s\n" + "sqrdmulh v30.4s, v30.4s, v6.4s\n" + "sqrdmulh v31.4s, v31.4s, v7.4s\n" + "tbz %x[flags], #5, 158f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd 
v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v4.16b, v24.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "and v4.16b, v28.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "and v5.16b, v29.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "and v6.16b, v30.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "and v7.16b, v31.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v29.4s, v29.4s, v5.4s\n" + "sqadd v30.4s, v30.4s, v6.4s\n" + "sqadd v31.4s, v31.4s, v7.4s\n" + "158:" // Height 6: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, 
v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + 
"srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v1.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v2.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v3.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 167f\n" + "tbz x15, #3, 162f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], 
#0x8\n" + "tbz x15, #2, 160f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x15, #1, 159f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 166f\n" + "159:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 166f\n" + "160:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 161f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 166f\n" + "161:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 166f\n" + "162:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 164f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + 
"str s28, [x21], #0x4\n" + "tbz x15, #1, 163f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 166f\n" + "163:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 166f\n" + "164:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 165f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 166f\n" + "165:" // Height 6: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "166:" // Height 6: Partial direct writeback: Done + "b 168f\n" + "167:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "168:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 170f\n" + "ldr x20, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 169f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "169:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "170:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp deleted file mode 100644 index a23101a7ce..0000000000 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - -class hybrid_s8s32_dot_16x4 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return 16; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_hybrid_s8s32_dot_16x4; - - hybrid_s8s32_dot_16x4(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_hybrid_s8s32_dot_16x4_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp deleted file mode 100644 index 4a7cdc59a7..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp +++ /dev/null @@ -1,2434 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "temploadreg0 .req X2\n" - "temploadreg1 .req X3\n" - "temploadreg2 .req X4\n" - "temploadreg3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - 
"b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], 
#0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins 
v15.d[1], temploadreg3\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins 
v15.d[1], temploadreg3\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, 
v14.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], 
#0x30]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, 
v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, 
v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d10, [%[b_ptr0], 
#0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, 
v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], 
#0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "temploadreg0 .req X4\n" - "temploadreg1 .req X5\n" - "temploadreg2 .req X6\n" - "temploadreg3 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ins 
v14.d[1], temploadreg2\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" 
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // 
sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "add a_ptr1, a_ptr1, #0x20\n" 
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ins v15.d[1], temploadreg3\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x4fa5e1f7 // sdot 
v23.4s, v15.16b, v5.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "ins v2.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa6e9b9 // sdot 
v25.4s, v13.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, 
[%[b_ptr0], #0x8]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f80e952 // 
sdot v18.4s, v10.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f84e131 // sdot v17.4s, 
v9.16b, v4.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d14, [%[b_ptr0], 
#0x60]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 
0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, 
v8.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 
0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str 
q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" - "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr d14, 
[%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v28.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v29.4s, #0\n" - "ins v14.d[1], temploadreg2\n" - "movi v30.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v31.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, 
v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa3e1de // sdot 
v30.4s, v14.16b, v3.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - 
"ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 
0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ins v2.d[1], temploadreg2\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr 
temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ins v3.d[1], temploadreg3\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - 
".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f82e118 // 
sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 
// sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, 
v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ins v15.d[1], 
temploadreg3\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ins 
v13.d[1], temploadreg1\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, 
v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa3e19c // sdot v28.4s, 
v12.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, 
v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot 
v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], 
%[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : 
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - 
"ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot 
v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 
0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 
0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" 
- "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr 
q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add 
%[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), 
[accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v27.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - 
"ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 
0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - 
"ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, 
v6.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr1, 
a_ptr1, #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 
0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // 
sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, 
v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, 
v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, 
v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] 
"r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v28.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v29.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "movi v30.4s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "movi v31.4s, #0\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, 
[c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 
0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr 
q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // 
sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, 
v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - 
"ldr q5, [a_ptr1]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, 
v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, 
v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da 
// sdot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, 
v14.16b, v7.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - 
".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x4fa0e9f3 // 
sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - 
"ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), 
[accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg, \ + const int32_t *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8s32_dot_6x16( ARGLIST ); + +class cls_a64_hybrid_s8s32_dot_6x16 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8s32_dot_6x16; + + cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp new file mode 100644 index 0000000000..3257986410 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp @@ -0,0 +1,3335 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const int32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 176f\n" + "cmp %x[M], #0x4\n" + "bgt 141f\n" + "beq 106f\n" + "cmp %x[M], #0x2\n" + "bgt 71f\n" + "beq 36f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "tbz %x[flags], #0, 13f\n" + "cmp x15, #0x10\n" + "bge 12f\n" + "tbz x15, #3, 7f\n" 
+ "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 5f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 11f\n" + "ldr s11, [x13, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x15, #1, 6f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 11f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 11f\n" + "ldr s10, [x13, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x15, #2, 9f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 8f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 11f\n" + "ldr s9, [x13, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x15, #1, 10f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "11:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x12, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], 
#3, 16f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 20f\n" + "cmp x11, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single 
iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 25f\n" + "cmp x11, #0x4\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb 
// sdot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 21b\n" + "cbz x11, 25f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 23f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 24f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 24f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "24:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "25:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "cmp x15, #0x10\n" + "bge 34f\n" + "tbz x15, #3, 29f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 27f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 26f\n" + "str d11, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v11.s }[2], [x13]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 33f\n" + "str s11, [x13, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 28f\n" + "str d10, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v10.s }[2], [x13]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 33f\n" + "str s10, [x13, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 30f\n" + "str d9, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v9.s }[2], [x13]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x15, #0, 33f\n" + 
"str s9, [x13, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 32f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v8.s }[2], [x13]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "35:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 37f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 38f\n" + "37:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "38:" // Height 2: Column loop + "tbz %x[flags], #0, 48f\n" + "cmp x15, #0x10\n" + "bge 47f\n" + "tbz x15, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x15, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], 
[x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 46f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x15, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x15, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, 
#0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "52:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 55f\n" + "cmp x11, #0x20\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, 
v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot 
v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 60f\n" + "cmp x11, #0x4\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + 
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 56b\n" + "cbz x11, 60f\n" + "57:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 58f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 59f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 59f\n" + "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "59:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "60:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "bge 69f\n" + "tbz x15, #3, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 62f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 61f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 68f\n" + "61:" // Height 2: Partial direct writeback: 
partial_1_12 + "tbz x15, #0, 68f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 63f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 68f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 66f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 65f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 68f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 67f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done + "b 70f\n" + "69:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "70:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 38b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL 
#2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "73:" // Height 3: Column loop + "tbz %x[flags], #0, 83f\n" + "cmp x15, #0x10\n" + "bge 82f\n" + "tbz x15, #3, 77f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 75f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 81f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x15, #1, 76f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 81f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 81f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x15, #2, 79f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 78f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + 
"ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 81f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x15, #1, 80f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 84f\n" + "82:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 84f\n" + "83:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "84:" // Height 3: setup done + "mov x12, #0x0\n" + "85:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 
87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 87f\n" + "86:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "87:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 90f\n" + "cmp x11, #0x20\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, 
v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "bge 88b\n" + "89:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" 
+ "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, 
v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "90:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 95f\n" + "cmp x11, #0x4\n" + "blt 92f\n" + "91:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, 
v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 91b\n" + "cbz x11, 95f\n" + "92:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 93f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 94f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 94f\n" + "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "94:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "95:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 85b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, 
[x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "bge 104f\n" + "tbz x15, #3, 99f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 97f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 96f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 103f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 98f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 103f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 101f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 100f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 103f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, #1, 102f\n" + "str d8, 
[x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "105:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 73b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 107f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 108f\n" + "107:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "108:" // Height 4: Column loop + "tbz %x[flags], #0, 118f\n" + "cmp x15, #0x10\n" + "bge 117f\n" + "tbz x15, #3, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz 
x15, #2, 110f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 109f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 116f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 116f\n" + "110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x15, #1, 111f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 116f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 116f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x15, #2, 114f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 113f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 116f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 
116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x15, #1, 115f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 119f\n" + "118:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "119:" // Height 4: setup done + "mov x12, #0x0\n" + "120:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, 
#0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 122f\n" + "121:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "122:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 125f\n" + "cmp x11, #0x20\n" + "blt 124f\n" + "123:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, 
[x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr 
q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "bge 123b\n" + "124:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + 
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, 
#0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "125:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 130f\n" + "cmp x11, #0x4\n" + "blt 127f\n" + "126:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + 
"cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 126b\n" + "cbz x11, 130f\n" + "127:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 128f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 129f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 129f\n" + "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "129:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 
0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "130:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 120b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 132f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 131f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v10.s }[2], 
[x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 135f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str 
q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "140:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 108b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 143f\n" + "142:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "143:" // Height 5: Column loop + "tbz %x[flags], #0, 153f\n" + "cmp x15, #0x10\n" + "bge 152f\n" + "tbz x15, #3, 147f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 145f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 144f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + 
"ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 151f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x15, #1, 146f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 151f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 151f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x15, #2, 149f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 148f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 151f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x15, #1, 150f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" 
+ "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 154f\n" + "152:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 154f\n" + "153:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "154:" // Height 5: setup done + "mov x12, #0x0\n" + "155:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + 
"tbz %x[flags], #3, 156f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 157f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 157f\n" + "156:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "157:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 160f\n" + "cmp x11, #0x20\n" + "blt 159f\n" + "158:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 
0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + 
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // 
sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "bge 158b\n" + "159:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, 
[x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot 
v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "160:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 165f\n" + "cmp x11, #0x4\n" + "blt 162f\n" + "161:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 
0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 161b\n" + "cbz x11, 165f\n" + "162:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 163f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 164f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 164f\n" + "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "164:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // 
sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "165:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 155b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 174f\n" + "tbz x15, #3, 169f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 167f\n" + "st1 { v10.4s }, [x13], 
#0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 166f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 173f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 168f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 173f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 173f\n" + "169:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 171f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 170f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 173f\n" + "170:" // Height 5: Partial direct 
writeback: partial_1_4 + "tbz x15, #0, 173f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 172f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "175:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 177f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], 
#0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 178f\n" + "177:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "178:" // Height 6: Column loop + "tbz %x[flags], #0, 188f\n" + "cmp x15, #0x10\n" + "bge 187f\n" + "tbz x15, #3, 182f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 180f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 179f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 186f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + 
"ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x15, #1, 181f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 186f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 186f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x15, #2, 184f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 183f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 186f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x15, #1, 185f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], 
#0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "186:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 189f\n" + "187:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 189f\n" + "188:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, 
#0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "189:" // Height 6: setup done + "mov x12, #0x0\n" + "190:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 191f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 192f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 192f\n" + "191:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "192:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 195f\n" + "cmp x11, #0x20\n" + "blt 194f\n" + "193:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // 
sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, 
v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, 
v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "bge 193b\n" + "194:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + 
"add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 
0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + 
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff 
// sdot v31.4s, v7.16b, v5.4b[3]\n" + "195:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 200f\n" + "cmp x11, #0x4\n" + "blt 197f\n" + "196:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 196b\n" + "cbz x11, 200f\n" + "197:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 198f\n" + "ldr h0, 
[x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 199f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 199f\n" + "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "199:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, 
v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "200:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 190b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 209f\n" + "tbz x15, #3, 204f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 202f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 201f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 208f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 203f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], 
#0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 208f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 206f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 205f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 208f\n" + "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 208f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 207f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 208f\n" + 
"207:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "210:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 178b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 212f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 211f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "211:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "212:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), 
[offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp new file mode 100644 index 0000000000..5b4a7f3e86 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_u8qa_dot_4x16( ARGLIST ); + +class cls_a64_hybrid_u8qa_dot_4x16 +{ +public: + typedef uint8_t operand_type; + typedef uint8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_u8qa_dot_4x16; + + cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp new file mode 100644 index 0000000000..ff12472063 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp @@ -0,0 +1,2072 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_u8qa_dot_4x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 94f\n" + "cmp %x[M], #0x2\n" + "bgt 63f\n" + "beq 32f\n" + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + "2:" // 
Height 1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 12f\n" + "cmp x27, #0x20\n" + "blt 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x11, #0x80]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x11, #0xd0]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + 
".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 9f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x11, #0x80]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xd0]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 
0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 11f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "12:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 19f\n" + "cmp x27, #0x4\n" + "blt 15f\n" + "13:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "tbnz %x[flags], #31, 14f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "14:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + "bge 13b\n" + "cbz x27, 19f\n" + "15:" // Height 1: Multiply loop: Skip odd blocks + "tbz x27, #1, 16f\n" + "ldr h0, [x26], #0x2\n" + "tbz x27, #0, 17f\n" + "ld1 { v0.b }[2], [x26]\n" + "b 17f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "17:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 18f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "18:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + "19:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, 
[x9, #0x0]\n" + "tbnz %x[flags], #31, 20f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "ld1r { v1.4s }, [x19]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "20:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "ldr q3, [x10, #0x30]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x19]\n" + "add x10, x10, #0x40\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 21f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "21:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, 
v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 30f\n" + "tbz x12, #3, 25f\n" + "str d16, [x9], #0x8\n" + "tbz x12, #2, 23f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "tbz x12, #1, 22f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[14], [x9]\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[12], [x9]\n" + "b 29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x12, #1, 24f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[10], [x9]\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[8], [x9]\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x12, #2, 27f\n" + "str s16, [x9], #0x4\n" + "tbz x12, #1, 26f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[6], [x9]\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[4], [x9]\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x12, #1, 28f\n" + "str h16, [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[2], [x9]\n" + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback + "str q16, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x12, x12, #0x10\n" + "bgt 3b\n" + "b 126f\n" + "32:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 33f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 34f\n" + "33:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x28, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "38:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 43f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q10, [x11, 
#0x60]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "40:" 
// Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, 
v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 50f\n" + "cmp x27, #0x4\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + "bge 44b\n" + "cbz x27, 50f\n" + "46:" // Height 2: Multiply 
loop: Skip odd blocks + "tbz x27, #1, 47f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x27, #0, 48f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 48f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "48:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 49f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "49:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + "50:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 36b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 51f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "51:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, 
#0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 52f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "52:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, 
%x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 61f\n" + "tbz x12, #3, 56f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x12, #2, 54f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x12, #1, 53f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x12, #1, 55f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], 
[x25]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x12, #2, 58f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x12, #1, 57f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x12, #1, 59f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "62:" // Height 2: Writeback done + "subs x12, x12, #0x10\n" + "bgt 34b\n" + "b 126f\n" + "63:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 64f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 65f\n" + "64:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "65:" 
// Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "66:" // Height 3: setup done + "mov x28, #0x0\n" + "67:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 68f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 69f\n" + "68:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "69:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 74f\n" + "cmp x27, #0x20\n" + "blt 72f\n" + "70:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0da // 
udot v26.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 
0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 71f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "71:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e137 // udot v23.4s, 
v9.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa0e8d2 
// udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "74:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 81f\n" + "cmp x27, #0x4\n" + "blt 77f\n" + "75:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbnz %x[flags], #31, 76f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "76:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + "sub x27, x27, #0x4\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, 
v4.16b, v2.4b[0]\n" + "bge 75b\n" + "cbz x27, 81f\n" + "77:" // Height 3: Multiply loop: Skip odd blocks + "tbz x27, #1, 78f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x27, #0, 79f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "79:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 80f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "80:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" + "81:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 67b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 82f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x19]\n" + 
"addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "neg v3.4s, v3.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "82:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 83f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" 
+ "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "83:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, 
v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x12, #3, 87f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x12, #2, 85f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x12, #1, 84f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + 
"tbz x12, #1, 86f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 91f\n" + "86:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x12, #2, 89f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x12, #1, 88f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x12, #1, 90f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "93:" // Height 3: Writeback done + "subs x12, x12, #0x10\n" + "bgt 65b\n" + "b 126f\n" + "94:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + 
"bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 95f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "96:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "97:" // Height 4: setup done + "mov x28, #0x0\n" + "98:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 100f\n" + "99:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "100:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 105f\n" + "cmp x27, #0x20\n" + "blt 103f\n" + 
"101:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, 
v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // 
udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 102f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "102:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bge 101b\n" + "103:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f83e11e 
// udot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" + ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" + 
".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "105:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 112f\n" + "cmp x27, #0x4\n" + "blt 108f\n" + "106:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 107f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 
0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" + "bge 106b\n" + "cbz x27, 112f\n" + "108:" // Height 4: Multiply loop: Skip odd blocks + "tbz x27, #1, 109f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x27, #0, 110f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 110f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "110:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 111f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 
0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "111:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n" + "112:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 113f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "neg v4.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, 
v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "113:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + 
"sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz %x[flags], #5, 114f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, v10.4s\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "114:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + 
"srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin 
v31.4s, v31.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 123f\n" + "tbz x12, #3, 118f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x12, #2, 116f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x12, #1, 115f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x12, #1, 117f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: 
partial_4_0 + "tbz x12, #2, 120f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x12, #1, 119f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x12, #1, 121f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "124:" // Height 4: Writeback done + "subs x12, x12, #0x10\n" + "bgt 96b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 126f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 125f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "125:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "126:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" 
(input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp deleted file mode 100644 index e5a88b4519..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - -class hybrid_u8u32_dot_16x4 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return 16; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_hybrid_u8u32_dot_16x4; - - hybrid_u8u32_dot_16x4(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_hybrid_u8u32_dot_16x4_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp deleted file mode 100644 index 735e5fd45a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp +++ /dev/null @@ -1,2434 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "temploadreg0 .req X2\n" - "temploadreg1 .req X3\n" - "temploadreg2 .req X4\n" - "temploadreg3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 
2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], 
#0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins 
v15.d[1], temploadreg3\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins 
v15.d[1], temploadreg3\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, 
v14.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], 
#0x30]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, 
v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, 
v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d10, [%[b_ptr0], 
#0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, 
v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], 
#0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "temploadreg0 .req X4\n" - "temploadreg1 .req X5\n" - "temploadreg2 .req X6\n" - "temploadreg3 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ins 
v14.d[1], temploadreg2\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" 
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // 
udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "add a_ptr1, a_ptr1, #0x20\n" 
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ins v15.d[1], temploadreg3\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x6fa5e1f7 // udot 
v23.4s, v15.16b, v5.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "ins v2.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa6e9b9 // udot 
v25.4s, v13.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, 
[%[b_ptr0], #0x8]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f80e952 // 
udot v18.4s, v10.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f84e131 // udot v17.4s, 
v9.16b, v4.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d14, [%[b_ptr0], 
#0x60]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 
0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, 
v8.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 
0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str 
q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" - "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr d14, 
[%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v28.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v29.4s, #0\n" - "ins v14.d[1], temploadreg2\n" - "movi v30.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v31.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, 
v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa3e1de // udot 
v30.4s, v14.16b, v3.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - 
"ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 
0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ins v2.d[1], temploadreg2\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr 
temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ins v3.d[1], temploadreg3\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - 
".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f82e118 // 
udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 
// udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, 
v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ins v15.d[1], 
temploadreg3\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ins 
v13.d[1], temploadreg1\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, 
v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa3e19c // udot v28.4s, 
v12.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, 
v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot 
v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], 
%[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : 
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - 
"ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot 
v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 
0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 
0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" 
- "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr 
q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add 
%[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), 
[accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v27.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - 
"ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 
0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - 
"ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, 
v6.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr1, 
a_ptr1, #0x10\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 
0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // 
udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, 
v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, 
v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, 
v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] 
"r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v28.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v29.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "movi v30.4s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "movi v31.4s, #0\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, 
[c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 
0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr 
q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // 
udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, 
v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - 
"ldr q5, [a_ptr1]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, 
v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, 
v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da 
// udot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, 
v14.16b, v7.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - 
".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x6fa0e9f3 // 
udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - 
"ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), 
[accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg, \ + const uint32_t *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_u8u32_dot_6x16( ARGLIST ); + /* Descriptor for the a64_hybrid_u8u32_dot_6x16 kernel: publishes its operand and result types, its blocking parameters, a transforms member and the function pointer through which the kernel is called. */ +class cls_a64_hybrid_u8u32_dot_6x16 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + /* Function-pointer type for the kernel entry point; it uses the same ARGLIST parameter list as the declaration above. */ + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters. out_height() reports 6. NOTE(review): presumably the number of output rows handled per kernel call - confirm against the kernel body. */ + static constexpr unsigned int out_height() + { + return 6; + } + /* Reports 16. NOTE(review): this is the only accessor here that is not declared constexpr - confirm whether that is intentional. */ + static unsigned int out_width() + { + return 16; + } + /* Reports 4. NOTE(review): presumably the granularity in K that the kernel consumes per step - confirm against the kernel body. */ + static constexpr unsigned int k_unroll() + { + return 4; + } + /* Always reports true for this kernel. */ + static constexpr bool supports_accumulate() + { + return true; + } + /* NOTE(review): StdTransformsFixed is shown here without template arguments - confirm against the upstream header before relying on this text. */ + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_u8u32_dot_6x16; + /* The CPUInfo pointer is accepted but not used: the parameter is unnamed and the body is empty, so the kernel member keeps its default initialiser. */ + cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp new file mode 100644 index 0000000000..3c8654147a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp @@ -0,0 +1,3335 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 176f\n" + "cmp %x[M], #0x4\n" + "bgt 141f\n" + "beq 106f\n" + "cmp %x[M], #0x2\n" + "bgt 71f\n" + "beq 36f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "tbz %x[flags], #0, 13f\n" + "cmp x15, #0x10\n" + "bge 12f\n" + "tbz x15, #3, 
7f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 5f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 11f\n" + "ldr s11, [x13, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x15, #1, 6f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 11f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 11f\n" + "ldr s10, [x13, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x15, #2, 9f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 8f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 11f\n" + "ldr s9, [x13, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x15, #1, 10f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "11:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x12, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz 
%x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 20f\n" + "cmp x11, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "bge 18b\n" + "19:" // Height 1: Multiply 
loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 25f\n" + "cmp x11, #0x4\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + 
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 21b\n" + "cbz x11, 25f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 23f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 24f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 24f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "24:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "25:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "cmp x15, #0x10\n" + "bge 34f\n" + "tbz x15, #3, 29f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 27f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 26f\n" + "str d11, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v11.s }[2], [x13]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 33f\n" + "str s11, [x13, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 28f\n" + "str d10, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v10.s }[2], [x13]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 33f\n" + "str s10, [x13, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 30f\n" + "str d9, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v9.s }[2], [x13]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz 
x15, #0, 33f\n" + "str s9, [x13, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 32f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v8.s }[2], [x13]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "35:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 37f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 38f\n" + "37:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "38:" // Height 2: Column loop + "tbz %x[flags], #0, 48f\n" + "cmp x15, #0x10\n" + "bge 47f\n" + "tbz x15, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x15, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { 
v14.s }[2], [x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 46f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x15, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x15, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr 
x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "52:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 55f\n" + "cmp x11, #0x20\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, 
v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed 
// udot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 60f\n" + "cmp x11, #0x4\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, 
#0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "bge 56b\n" + "cbz x11, 60f\n" + "57:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 58f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 59f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 59f\n" + "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "59:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "60:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "bge 69f\n" + "tbz x15, #3, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 62f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 61f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 68f\n" + "61:" // Height 2: Partial 
direct writeback: partial_1_12 + "tbz x15, #0, 68f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 63f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 68f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 66f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 65f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 68f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 67f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done + "b 70f\n" + "69:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "70:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 38b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add 
x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "73:" // Height 3: Column loop + "tbz %x[flags], #0, 83f\n" + "cmp x15, #0x10\n" + "bge 82f\n" + "tbz x15, #3, 77f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 75f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 81f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x15, #1, 76f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 81f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 81f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x15, #2, 79f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 78f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz 
x15, #0, 81f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 81f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x15, #1, 80f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 84f\n" + "82:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 84f\n" + "83:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "84:" // Height 3: setup done + "mov x12, #0x0\n" + "85:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, 
#0x10]\n" + "cbnz x12, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 87f\n" + "86:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "87:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 90f\n" + "cmp x11, #0x20\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca 
// udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "bge 88b\n" + "89:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + 
"ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 
// udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "90:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 95f\n" + "cmp x11, #0x4\n" + "blt 92f\n" + "91:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed 
// udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "bge 91b\n" + "cbz x11, 95f\n" + "92:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 93f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 94f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 94f\n" + "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "94:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "95:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 85b\n" + "prfm pstl1keep, [x13, 
#0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "bge 104f\n" + "tbz x15, #3, 99f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 97f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 96f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 103f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 98f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 103f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 101f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 100f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 103f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, 
#1, 102f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "105:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 73b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 107f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 108f\n" + "107:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "108:" // Height 4: Column loop + "tbz %x[flags], #0, 118f\n" + "cmp x15, #0x10\n" + "bge 117f\n" + "tbz x15, #3, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, 
[x25], #0x10\n" + "tbz x15, #2, 110f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 109f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 116f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 116f\n" + "110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x15, #1, 111f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 116f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 116f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x15, #2, 114f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 113f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 116f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, 
[x25, #0x0]\n" + "b 116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x15, #1, 115f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 119f\n" + "118:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "119:" // Height 4: setup done + "mov x12, #0x0\n" + "120:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" 
+ "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 122f\n" + "121:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "122:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 125f\n" + "cmp x11, #0x20\n" + "blt 124f\n" + "123:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, 
v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, 
v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "bge 123b\n" + "124:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + 
"ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, 
v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "125:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 130f\n" + "cmp x11, #0x4\n" + "blt 127f\n" + "126:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot 
v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "bge 126b\n" + "cbz x11, 130f\n" + "127:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 128f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 129f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 129f\n" + "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "129:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, 
[x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "130:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 120b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 132f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 131f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x15, #0, 138f\n" 
+ "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 135f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, 
[x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "140:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 108b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 143f\n" + "142:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "143:" // Height 5: Column loop + "tbz %x[flags], #0, 153f\n" + "cmp x15, #0x10\n" + "bge 152f\n" + "tbz x15, #3, 147f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 145f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 144f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { 
v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 151f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x15, #1, 146f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 151f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 151f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x15, #2, 149f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 148f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 151f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x15, #1, 150f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + 
"ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 154f\n" + "152:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 154f\n" + "153:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "154:" // Height 5: setup done + "mov x12, #0x0\n" + "155:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, 
[x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 156f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 157f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 157f\n" + "156:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "157:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 160f\n" + "cmp x11, #0x20\n" + "blt 159f\n" + "158:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, 
v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot 
v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, 
v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "bge 158b\n" + "159:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot 
v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "160:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 165f\n" + "cmp x11, #0x4\n" + "blt 162f\n" + "161:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], 
#0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "bge 161b\n" + "cbz x11, 165f\n" + "162:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 163f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 164f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 164f\n" + "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "164:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr 
q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "165:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 155b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 174f\n" + "tbz x15, #3, 169f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x15, 
#2, 167f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 166f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 173f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 168f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 173f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 173f\n" + "169:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 171f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 170f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 173f\n" + 
"170:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x15, #0, 173f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 172f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "175:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 177f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], 
#0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 178f\n" + "177:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "178:" // Height 6: Column loop + "tbz %x[flags], #0, 188f\n" + "cmp x15, #0x10\n" + "bge 187f\n" + "tbz x15, #3, 182f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 180f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 179f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 186f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, 
#0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x15, #1, 181f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 186f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 186f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x15, #2, 184f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 183f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 186f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x15, #1, 185f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, 
[x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "186:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 189f\n" + "187:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 189f\n" + "188:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi 
v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "189:" // Height 6: setup done + "mov x12, #0x0\n" + "190:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 191f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 192f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 192f\n" + "191:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "192:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 195f\n" + "cmp x11, #0x20\n" + "blt 194f\n" + "193:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, 
#0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 
0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot 
v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" + "bge 193b\n" + "194:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, 
v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + 
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + 
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" + "195:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 200f\n" + "cmp x11, #0x4\n" + "blt 197f\n" + "196:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "bge 196b\n" + "cbz x11, 200f\n" + "197:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 
198f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 199f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 199f\n" + "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "199:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot 
v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "200:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 190b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 209f\n" + "tbz x15, #3, 204f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 202f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 201f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 208f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 203f\n" + "str d10, [x13], #0x8\n" + "str 
d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 208f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 206f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 205f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 208f\n" + "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 208f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 207f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 
208f\n" + "207:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "210:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 178b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 212f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 211f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "211:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "212:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, 
num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp deleted file mode 100644 index 95fed86c2f..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../bfloat.hpp" -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int); -void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int); - -class interleaved_bf16fp32_dot_12x8 { -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return 12; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 2; - } - - // Use the standard fixed size transforms. - StdTransformsFixed transforms = {}; - - kern_type kernel=a64_interleaved_bf16fp32_dot_12x8; - - interleaved_bf16fp32_dot_12x8(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::X1) { - kernel = a64_interleaved_bf16fp32_dot_12x8_x1; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp deleted file mode 100644 index 7ffae524dc..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; - - K /= 2; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb transforms = {}; + + kern_type kernel=a64_interleaved_bf16fp32_dot_8x12; + + cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp new file mode 100644 index 0000000000..92149a5579 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb transforms = {}; - - kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8; - - interleaved_bf16fp32_mmla_12x8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp deleted file mode 100644 index 7f0eff29af..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __aarch64__ - -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; - - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb transforms = {}; + + kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12; + + cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp new file mode 100644 index 0000000000..c476fcf171 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 4; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); - -class interleaved_s8s32_mmla_12x8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return 12; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 8; - } - - // Use the standard fixed size transforms. - StdTransformsFixed transforms = {}; - - kern_type kernel=a64_interleaved_s8s32_mmla_12x8; - - interleaved_s8s32_mmla_12x8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp deleted file mode 100644 index 7953510aa7..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class cls_a64_interleaved_s8s32_mmla_8x12 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel=a64_interleaved_s8s32_mmla_8x12; + + cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp new file mode 100644 index 0000000000..2093e75b8e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_fixed.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - -class interleaved_u8u32_mmla_12x8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return 12; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 8; - } - - // Use the standard fixed size transforms. - StdTransformsFixed transforms = {}; - - kern_type kernel=a64_interleaved_u8u32_mmla_12x8; - - interleaved_u8u32_mmla_12x8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp deleted file mode 100644 index dcd15f0345..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class cls_a64_interleaved_u8u32_mmla_8x12 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + StdTransformsFixed transforms_quantized = {}; + + kern_type kernel=a64_interleaved_u8u32_mmla_8x12; + + cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp new file mode 100644 index 0000000000..568e5d1098 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb transforms = {}; - - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 3.724, 1.416, 1.113 }; - - case CPUModel::A53: - return { 2.777, 0.987, 0.898 }; - - case CPUModel::A73: - return { 2.885, 1.429, 1.163 }; - - default: - return { 6.949, 4.149, 2.826 }; - } - } - - kern_type kernel=a64_sgemm_asimd_12x8; - - sgemm_12x8(const CPUInfo *ci) { - // Select specific kernel if available - switch(ci->get_cpu_model()) { - case CPUModel::A53: - kernel = a64_sgemm_asimd_12x8_a53; - break; - - case CPUModel::A55r0: - kernel = a64_sgemm_asimd_12x8_a55; - break; - - case CPUModel::A55r1: - kernel = a64_sgemm_asimd_12x8_a55r1; - break; - - case CPUModel::X1: - kernel = a64_sgemm_asimd_12x8_x1; - break; - - default: - /* Generic kernel is initialized by default. */ - break; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp deleted file mode 100644 index 5532485efb..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k_iters = ((K+1)/2) - 1; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -namespace arm_gemm { - -void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. 
-// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -namespace arm_gemm { - -void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb transforms = {}; + + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 3.724, 1.416, 1.113 }; + + case CPUModel::A53: + return { 2.777, 0.987, 0.898 }; + + case CPUModel::A73: + return { 2.885, 1.429, 1.163 }; + + default: + return { 6.949, 4.149, 2.826 }; + } + } + + kern_type kernel=a64_sgemm_asimd_8x12; + + cls_a64_sgemm_8x12(const CPUInfo *ci) { + // Select specific kernel if available + switch(ci->get_cpu_model()) { + case CPUModel::A53: + kernel = a64_sgemm_asimd_8x12_a53; + break; + + case CPUModel::A55r0: + kernel = a64_sgemm_asimd_8x12_a55; + break; + + case CPUModel::A55r1: + kernel = a64_sgemm_asimd_8x12_a55r1; + break; + + case CPUModel::X1: + kernel = a64_sgemm_asimd_8x12_x1; + break; + + default: + /* Generic kernel is initialized by default. */ + break; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp new file mode 100644 index 0000000000..f4b6e7b70f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k_iters = ((K+1)/2) - 1; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. 
+// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6; - - smallK_hybrid_fp32_mla_4x6(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp deleted file mode 100644 index e2fec6af16..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp +++ /dev/null @@ -1,4612 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_fp32_mla_4x6(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(float); - const long ldcb = ldc * sizeof(float); - float nullbias[4]; - if (!bias) { - memset(nullbias, 0, (4 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - for (int y0=0; y0 transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8; - - smallK_hybrid_fp32_mla_4x8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp deleted file mode 100644 index 11888bce74..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp +++ /dev/null @@ -1,3340 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(float); - const long ldcb = ldc * sizeof(float); - float nullbias[4]; - if (!bias) { - memset(nullbias, 0, (4 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - for (int y0=0; y0 transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4; + + cls_a64_smallK_hybrid_fp32_mla_6x4(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp new file mode 100644 index 0000000000..52548b462c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp @@ -0,0 +1,4612 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + float nullbias[4]; + if (!bias) { + memset(nullbias, 0, (4 * sizeof(float))); + } + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + for (int y0=0; y0 transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4; + + cls_a64_smallK_hybrid_fp32_mla_8x4(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp new file mode 100644 index 0000000000..deaef27ee9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp @@ -0,0 +1,3340 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + float nullbias[4]; + if (!bias) { + memset(nullbias, 0, (4 * sizeof(float))); + } + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + for (int y0=0; y0 - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - -class smallK_hybrid_s8s32_dot_4x6 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 6; - } - - static unsigned int out_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { 
- return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x6; - - smallK_hybrid_s8s32_dot_4x6(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_s8s32_dot_4x6_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp deleted file mode 100644 index 2d6d2f064c..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp +++ /dev/null @@ -1,4130 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(int8_t); - const long ldcb = ldc * sizeof(int32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(int8_t); - const long ldcb = ldc * sizeof(int32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - -class smallK_hybrid_s8s32_dot_4x8 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return false; - } - - static 
constexpr bool supports_activation() - { - return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x8; - - smallK_hybrid_s8s32_dot_4x8(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_s8s32_dot_4x8_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp deleted file mode 100644 index 7135f2eee6..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp +++ /dev/null @@ -1,3088 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(int8_t); - const long ldcb = ldc * sizeof(int32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(int8_t); - const long ldcb = ldc * sizeof(int32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); +void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + +class cls_a64_smallK_hybrid_s8s32_dot_6x4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return false; + } + + 
static constexpr bool supports_activation() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4; + + cls_a64_smallK_hybrid_s8s32_dot_6x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_smallK_hybrid_s8s32_dot_6x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp new file mode 100644 index 0000000000..a9926602fc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp @@ -0,0 +1,4854 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(int8_t); + const long ldcb = ldc * sizeof(int32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(int8_t); + const long ldcb = ldc * sizeof(int32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); +void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + +class cls_a64_smallK_hybrid_s8s32_dot_8x4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return false; + } + + 
static constexpr bool supports_activation() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4; + + cls_a64_smallK_hybrid_s8s32_dot_8x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_smallK_hybrid_s8s32_dot_8x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp new file mode 100644 index 0000000000..aba6e0d100 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp @@ -0,0 +1,3352 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(int8_t); + const long ldcb = ldc * sizeof(int32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(int8_t); + const long ldcb = ldc * sizeof(int32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - -class smallK_hybrid_u8u32_dot_4x6 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 6; - } - - static unsigned int out_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return false; - } - - 
static constexpr bool supports_activation() - { - return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x6; - - smallK_hybrid_u8u32_dot_4x6(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_u8u32_dot_4x6_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp deleted file mode 100644 index 02894d8327..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp +++ /dev/null @@ -1,4130 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(uint8_t); - const long ldcb = ldc * sizeof(uint32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(uint8_t); - const long ldcb = ldc * sizeof(uint32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -namespace arm_gemm -{ - -// Actual kernel implementations -void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - -class smallK_hybrid_u8u32_dot_4x8 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return 
false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsFixed transforms = {}; - - // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x8; - - smallK_hybrid_u8u32_dot_4x8(const CPUInfo *ci) - { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_u8u32_dot_4x8_a55; - } - } -}; - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp deleted file mode 100644 index e70fb6955e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp +++ /dev/null @@ -1,3088 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(uint8_t); - const long ldcb = ldc * sizeof(uint32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)4) - 1; - const long ldab = lda * sizeof(uint8_t); - const long ldcb = ldc * sizeof(uint32_t); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + +class cls_a64_smallK_hybrid_u8u32_dot_6x4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + 
return false; + } + + static constexpr bool supports_activation() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4; + + cls_a64_smallK_hybrid_u8u32_dot_6x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_smallK_hybrid_u8u32_dot_6x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp new file mode 100644 index 0000000000..dddf4c5aa2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp @@ -0,0 +1,4854 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(uint8_t); + const long ldcb = ldc * sizeof(uint32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(uint8_t); + const long ldcb = ldc * sizeof(uint32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + +class cls_a64_smallK_hybrid_u8u32_dot_8x4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + 
return false; + } + + static constexpr bool supports_activation() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4; + + cls_a64_smallK_hybrid_u8u32_dot_8x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_smallK_hybrid_u8u32_dot_8x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp new file mode 100644 index 0000000000..fcb546f51e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp @@ -0,0 +1,3352 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(uint8_t); + const long ldcb = ldc * sizeof(uint32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { + const long loops_count = iceildiv(N, (int)4) - 1; + const long ldab = lda * sizeof(uint8_t); + const long ldcb = ldc * sizeof(uint32_t); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0(); + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return true; + } + + static constexpr bool supports_activation() + { + return true; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_gemv_fp32_mla_8VL; + + cls_sve_gemv_fp32_mla_8VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp new file mode 100644 index 0000000000..c62e31936c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp @@ -0,0 +1,1372 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_gemv_fp32_mla_8VL ( + const float *A_ptr, const float *B_ptr, float *output_ptr, + size_t N, size_t K, + const float *bias, Activation act, bool +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + const float *B_ptr = {}; + size_t output_offset = {}; + unsigned int input_initial_col = {}; + } ka; + + unsigned long flags=0; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "cntw x24\n" + "add x23, %x[N], x24\n" + "sub x23, x23, #0x1\n" + "udiv x23, x23, x24\n" + "mov x22, %x[bias]\n" + "1:" // Column loop + "cmp x23, #0x8\n" + "bge 50f\n" + "cmp x23, #0x6\n" + "bgt 43f\n" + "beq 36f\n" + "cmp x23, #0x4\n" + "bgt 29f\n" + "beq 22f\n" + "cmp x23, #0x2\n" + "bgt 15f\n" + "beq 8f\n" + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "whilelt p1.s, XZR, %x[N]\n" + "cbz x22, 2f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "addvl x22, x22, #1\n" + "b 3f\n" + "2:" // Width 1: no bias + "mov z24.b, #0x0\n" + "3:" // Width 1: setup done + "cmp x21, #0x4\n" + "ble 5f\n" + "4:" // Width 1: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x21, x21, #0x4\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z2.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "prfm 
pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 4b\n" + "5:" // Width 1: Multiply loop: Single iteration only + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z5.s, z0.s[0]\n" + "add x20, x20, #0x10\n" + "subs x21, x21, #0x1\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z6.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "6:" // Width 1: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 7f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "7:" // Width 1: No activation + "st1w { z24.s }, p1, [%x[output_ptr]]\n" + "addvl %x[output_ptr], %x[output_ptr], 
#1\n" + "b 57f\n" + "8:" // Width 2 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "sub x19, %x[N], x24\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 9f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "b 10f\n" + "9:" // Width 2: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "10:" // Width 2: setup done + "cmp x21, #0x4\n" + "ble 12f\n" + "11:" // Width 2: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x21, x21, #0x4\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[1]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z4.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z5.s, z0.s[2]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z6.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 11b\n" + "12:" // Width 2: Multiply loop: Single iteration only + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla 
z24.s, z9.s, z0.s[0]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z10.s, z0.s[0]\n" + "subs x21, x21, #0x1\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z12.s, z0.s[1]\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z14.s, z0.s[2]\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "13:" // Width 2: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 14f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "14:" // Width 2: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #2\n" + "b 57f\n" + "15:" // Width 3 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x2\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 16f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, 
p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "addvl x22, x22, #3\n" + "b 17f\n" + "16:" // Width 3: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "17:" // Width 3: setup done + "cmp x21, #0x4\n" + "ble 19f\n" + "18:" // Width 3: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "cmp x21, #0x4\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[1]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z5.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z6.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[2]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z8.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z9.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z10.s, z0.s[3]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z11.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z12.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 18b\n" + "19:" // Width 3: Multiply loop: Single iteration only + 
"ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z13.s, z0.s[0]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z14.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z15.s, z0.s[0]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z17.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z18.s, z0.s[1]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z19.s, z0.s[2]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z20.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z21.s, z0.s[2]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z22.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z23.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z1.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "20:" // Width 3: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 21f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, 
z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "21:" // Width 3: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #3\n" + "b 57f\n" + "22:" // Width 4 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x3\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 23f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "addvl x22, x22, #4\n" + "b 24f\n" + "23:" // Width 4: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "24:" // Width 4: setup done + "cmp x21, #0x4\n" + "ble 26f\n" + "25:" // Width 4: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z5.s, z0.s[1]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z6.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z7.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + 
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z9.s, z0.s[2]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z10.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z11.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z12.s, z0.s[2]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z15.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 25b\n" + "26:" // Width 4: Multiply loop: Single iteration only + "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z17.s, z0.s[0]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z18.s, z0.s[0]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z19.s, z0.s[0]\n" + "fmla z27.s, z20.s, z0.s[0]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z22.s, z0.s[1]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z23.s, 
z0.s[1]\n" + "fmla z27.s, z1.s, z0.s[1]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z2.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z3.s, z0.s[2]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z4.s, z0.s[2]\n" + "fmla z27.s, z5.s, z0.s[2]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z7.s, z0.s[3]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z9.s, z0.s[3]\n" + "27:" // Width 4: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 28f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "28:" // Width 4: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #4\n" + "b 57f\n" + "29:" // Width 5 + "mov 
x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x4\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 30f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "addvl x22, x22, #5\n" + "b 31f\n" + "30:" // Width 5: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "31:" // Width 5: setup done + "cmp x21, #0x4\n" + "ble 33f\n" + "32:" // Width 5: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z6.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z7.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z8.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z27.s, z9.s, z0.s[1]\n" + "fmla z28.s, z10.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, 
MUL VL]\n" + "fmla z25.s, z12.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z13.s, z0.s[2]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z14.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z15.s, z0.s[2]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z17.s, z0.s[3]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z18.s, z0.s[3]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z19.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z20.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 32b\n" + "33:" // Width 5: Multiply loop: Single iteration only + "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z21.s, z0.s[0]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z22.s, z0.s[0]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z23.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z1.s, z0.s[0]\n" + "fmla z28.s, z2.s, z0.s[0]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[1]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z4.s, z0.s[1]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z5.s, z0.s[1]\n" + "ld1w { z7.s }, 
p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z6.s, z0.s[1]\n" + "fmla z28.s, z7.s, z0.s[1]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[2]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[2]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z11.s, z0.s[2]\n" + "fmla z28.s, z12.s, z0.s[2]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z17.s, z0.s[3]\n" + "34:" // Width 5: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 35f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, 
p2/M, z28.s, z17.s\n" + "35:" // Width 5: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #5\n" + "b 57f\n" + "36:" // Width 6 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x5\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 37f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "b 38f\n" + "37:" // Width 6: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "38:" // Width 6: setup done + "cmp x21, #0x4\n" + "ble 40f\n" + "39:" // Width 6: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z9.s 
}, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z8.s, z0.s[1]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z10.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z11.s, z0.s[1]\n" + "fmla z29.s, z12.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[2]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z16.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z17.s, z0.s[2]\n" + "fmla z29.s, z18.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z20.s, z0.s[3]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z21.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z22.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z1.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 39b\n" + "40:" // Width 6: Multiply loop: Single iteration only + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + 
"fmla z24.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z5.s, z0.s[0]\n" + "fmla z28.s, z6.s, z0.s[0]\n" + "fmla z29.s, z7.s, z0.s[0]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z11.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z12.s, z0.s[1]\n" + "fmla z29.s, z13.s, z0.s[1]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z16.s, z0.s[2]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z17.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z19.s, z0.s[2]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z20.s, z0.s[3]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + 
"ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z21.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z22.s, z0.s[3]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z23.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z1.s, z0.s[3]\n" + "fmla z29.s, z2.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "41:" // Width 6: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 42f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "42:" // Width 6: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #6\n" + "b 57f\n" + "43:" // Width 7 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x6\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 44f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + 
"ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" + "addvl x22, x22, #7\n" + "b 45f\n" + "44:" // Width 7: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "45:" // Width 7: setup done + "cmp x21, #0x4\n" + "ble 47f\n" + "46:" // Width 7: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "fmla z30.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z11.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z12.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z13.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z15.s }, p2/Z, 
[%x[B_ptr]]\n" + "fmla z30.s, z14.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z16.s, z0.s[2]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z17.s, z0.s[2]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z18.s, z0.s[2]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z19.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z29.s, z20.s, z0.s[2]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z30.s, z21.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[3]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z23.s, z0.s[3]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z1.s, z0.s[3]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z2.s, z0.s[3]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z3.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z29.s, z4.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z30.s, z5.s, z0.s[3]\n" + "bgt 46b\n" + "47:" // Width 7: Multiply loop: Single iteration only + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z6.s, z0.s[0]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z7.s, z0.s[0]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z8.s, z0.s[0]\n" + "ld1w { z11.s }, p2/Z, 
[%x[B_ptr], #5, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z27.s, z9.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z10.s, z0.s[0]\n" + "fmla z29.s, z11.s, z0.s[0]\n" + "fmla z30.s, z12.s, z0.s[0]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z16.s, z0.s[1]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z18.s, z0.s[1]\n" + "fmla z30.s, z19.s, z0.s[1]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z20.s, z0.s[2]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z21.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z22.s, z0.s[2]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z23.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z1.s, z0.s[2]\n" + "fmla z29.s, z2.s, z0.s[2]\n" + "fmla z30.s, z3.s, z0.s[2]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[3]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" 
+ "fmla z25.s, z5.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z6.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z7.s, z0.s[3]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z9.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z10.s, z0.s[3]\n" + "48:" // Width 7: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 49f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "49:" // Width 7: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" + "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #7\n" + "b 57f\n" + "50:" // Width 8 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x7\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 51f\n" + "ld1w { z24.s 
}, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" + "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n" + "addvl x22, x22, #8\n" + "b 52f\n" + "51:" // Width 8: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "52:" // Width 8: setup done + "cmp x21, #0x4\n" + "ble 54f\n" + "53:" // Width 8: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z30.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z8.s, z0.s[0]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z10.s, z0.s[1]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z11.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL 
VL]\n" + "fmla z27.s, z12.s, z0.s[1]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z14.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z15.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z16.s, z0.s[1]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z17.s, z0.s[2]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z18.s, z0.s[2]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z19.s, z0.s[2]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z20.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z21.s, z0.s[2]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z22.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z23.s, z0.s[2]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z1.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z2.s, z0.s[3]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z3.s, z0.s[3]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z4.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z5.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z6.s, z0.s[3]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z7.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z31.s, z9.s, z0.s[3]\n" + "bgt 
53b\n" + "54:" // Width 8: Multiply loop: Single iteration only + "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z10.s, z0.s[0]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z11.s, z0.s[0]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z12.s, z0.s[0]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z27.s, z13.s, z0.s[0]\n" + "fmla z28.s, z14.s, z0.s[0]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z15.s, z0.s[0]\n" + "fmla z30.s, z16.s, z0.s[0]\n" + "fmla z31.s, z17.s, z0.s[0]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z18.s, z0.s[1]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z19.s, z0.s[1]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z20.s, z0.s[1]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z21.s, z0.s[1]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z22.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z23.s, z0.s[1]\n" + "fmla z30.s, z1.s, z0.s[1]\n" + "fmla z31.s, z2.s, z0.s[1]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[2]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL 
VL]\n" + "fmla z25.s, z4.s, z0.s[2]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z5.s, z0.s[2]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z6.s, z0.s[2]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z7.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z8.s, z0.s[2]\n" + "fmla z30.s, z9.s, z0.s[2]\n" + "fmla z31.s, z10.s, z0.s[2]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z12.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z13.s, z0.s[3]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z14.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z17.s, z0.s[3]\n" + "fmla z31.s, z18.s, z0.s[3]\n" + "55:" // Width 8: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 56f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, 
p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmin z31.s, p2/M, z31.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "fmax z31.s, p2/M, z31.s, z17.s\n" + "56:" // Width 8: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "subs x23, x23, #0x8\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "sub %x[N], %x[N], x24, LSL #3\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" + "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n" + "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #8\n" + "bgt 1b\n" + "57:" // Exit + + : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) + : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) + : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp deleted file mode 100644 index eba98bb74d..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_dot_4VLx4 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 2; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4; - - hybrid_bf16fp32_dot_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp deleted file mode 100644 index 385a16fe10..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2247 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 1) / 2) * 2; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 1) / 2; - float nullbias[256]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0]]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, 
z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - 
"ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], 
#7, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot 
z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - 
".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl 
%[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, 
[%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot 
z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, 
z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, 
[%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - "ld1h z10.h, 
p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - 
"ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, 
z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 
// bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 
0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" 
(c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, 
a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6462415a 
// bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot 
z27.s, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // 
bfdot z24.s, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL 
VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - 
"ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 
0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, 
z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - 
"ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" 
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, 
z2.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, 
[%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax 
z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" 
- "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z28.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w 
z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 
0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL 
VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, 
z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz 
%[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot 
z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - 
"ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, 
z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" 
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], 
#-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, 
z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 
// bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - 
".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 
// bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin 
z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z24.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), 
[biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp new file mode 100644 index 0000000000..e344d82dc6 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" +#include "../bfloat.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST ); + +class cls_sve_hybrid_bf16fp32_dot_6x4VL +{ +public: + typedef bfloat16 operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 2; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL; + + cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp new file mode 100644 index 0000000000..19385e56ea --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -0,0 +1,2237 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_bf16fp32_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x16, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 4f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x8\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 
0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1h 
{ z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin 
z11.s, p5/M, z11.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "13:" // Height 1: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 18f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov 
z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "23:" // Height 2: input setup done + "cmp x11, #0x8\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, 
[x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // 
bfdot z9.s, z7.h, z0.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + 
"ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "27:" // Height 2: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + 
"bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 32f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov 
z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "37:" // Height 3: input setup done + "cmp x11, #0x8\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, 
p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + 
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 
// bfdot z16.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl 
x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "41:" // Height 3: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + 
"st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 46f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" 
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "51:" // Height 4: input setup done + "cmp x11, #0x8\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + 
"whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + 
"ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // 
bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + "ble 54f\n" + "ld1h { z6.h }, 
p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot 
z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, 
z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "55:" // Height 4: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 60f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { 
z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "65:" // Height 5: input setup done + "cmp x11, #0x8\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, 
p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, 
z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + 
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, 
p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, 
z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // 
bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + 
"fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "69:" // Height 5: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, 
[x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 74f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov 
z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz 
%x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "79:" // Height 6: input setup done + "cmp x11, #0x8\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, 
z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 
0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, 
#-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, 
z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // 
bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647540de // 
bfdot z30.s, z6.h, z5.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, 
#0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmin z28.s, p5/M, z28.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, 
p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "fmax z28.s, p5/M, z28.s, z1.s\n" + "fmin z29.s, p5/M, z29.s, z0.s\n" + "fmin z30.s, p5/M, z30.s, z0.s\n" + "fmin z31.s, p5/M, z31.s, z0.s\n" + "fmax z29.s, p5/M, z29.s, z1.s\n" + "fmax z30.s, p5/M, z30.s, z1.s\n" + "fmax z31.s, p5/M, z31.s, z1.s\n" + "83:" // Height 6: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), 
[output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp deleted file mode 100644 index 641e5c12fd..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_mmla_4VLx4 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return get_vector_length() * 2; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4; - - hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp deleted file mode 100644 index 76e3546c6f..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp +++ /dev/null @@ -1,3459 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - float nullbias[128]; - if (!accumulate && !bias) { - memset(nullbias, 0, (2 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (2 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z14.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "mov z1.h, #0\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "mov z14.s, #0\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - 
".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" 
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, 
z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" 
- "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp1 z1.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w 
z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "subs %[loops], %[loops], #0x1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - 
"ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, 
[%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 
0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p0, [c_ptr1]\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), 
[regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z20.d, z16.d\n" - "mov z21.d, z17.d\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, 
p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla 
z18.s, z14.h, z0.h\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - 
"2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 
0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, 
z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - 
"uzp1 z4.s, z20.s, z21.s\n" - "uzp1 z5.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z14.h, p7/z, [%[b_ptr0], 
#2, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e594 // 
bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - 
".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 
0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 
0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - 
"addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - 
"fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "add a_ptr1, 
%[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z5.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z22.d, z18.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z23.d, z19.d\n" - "mov z24.d, z16.d\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z5.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - 
"ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h 
z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, 
z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.h, #0\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla 
z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // 
bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.h, #0\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 
0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, 
z14.h, z9.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - 
".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, 
p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z9.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, 
a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, 
a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "add a_ptr5, a_ptr5, #0x10\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, 
[%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - 
"ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // 
bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh 
z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, 
z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - 
".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, 
z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin 
z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p0, [c_ptr5]\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req 
X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z7.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "mov z20.d, z16.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z25.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "mov 
z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z7.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" 
- "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla 
z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL 
VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // 
bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // 
bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl 
a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, 
z15.h, z11.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - 
"addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p6/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" 
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" 
- "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, 
z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp1 z13.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; - default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, 
mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr7]\n" - "mov z20.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z21.d, z17.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.d, z18.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z25.d, z17.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z26.d, z18.d\n" - "add a_ptr7, a_ptr7, #0x10\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, 
p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1rqh z7.h, p7/z, [a_ptr7]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "add a_ptr7, a_ptr7, #0x10\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "ld1w z14.s, p0/z, [c_ptr7]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - 
".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p7/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "add a_ptr7, a_ptr7, #0x20\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1rqh z7.h, p7/z, [a_ptr7, #-0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 
0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // 
bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p7/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, 
z13.h, z0.h\n" - "ld1rqh z7.h, p6/z, [a_ptr7, #0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "addvl a_ptr7, a_ptr7, #2\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla 
z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, 
z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p6/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p6/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "addvl a_ptr7, a_ptr7, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 
// bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 
z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p0, [c_ptr7]\n" - "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" - "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq 
a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<long>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp deleted file mode 100644 index bd457e9d27..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_mmla_6VLx2 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length<float>() * 3; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2; - - hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp deleted file mode 100644 index 59dc6dc540..0000000000 ---
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp +++ /dev/null @@ -1,1633 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - float nullbias[192]; - if (!accumulate && !bias) { - memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const bfloat16 * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(bfloat16); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) { - const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ?
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.h, #0\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z19.s, z19.s\n" - "zip2 z25.s, z19.s, z19.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z18.s, #0\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "mov z1.h, #0\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z18.s, #0\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z22.s, z17.s, z18.s\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z24.s, z17.s, z18.s\n" - "zip2 z25.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, 
z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z11.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "trn2 z4.d, z2.d, z3.d\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL 
VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, 
[%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 
// bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "uzp1 z1.s, z22.s, z23.s\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "uzp1 z2.s, z24.s, z25.s\n" - "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<long>(accumulate)), [lda] "r"
(ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z19.s, z19.s\n" - "zip2 z25.s, z19.s, z19.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, 
z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z17.s, z18.s\n" - "zip2 z25.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z4.d, z2.d, z3.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "trn1 z0.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - 
".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // 
bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "trn2 z4.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, 
z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - 
"ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, 
z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - 
"add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z26.d, z20.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z27.d, z21.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z24.s, z19.s, z19.s\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip2 z25.s, z19.s, z19.s\n" - "mov z28.d, z22.d\n" - "mov z29.d, z23.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z25.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z17.s, z18.s\n" 
- "zip2 z25.s, z17.s, z18.s\n" - "ld1w z17.s, p0/z, [c_ptr2]\n" - "mov z18.s, #0\n" - "zip1 z26.s, z17.s, z18.s\n" - "zip2 z27.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z28.s, z17.s, z18.s\n" - "zip2 z29.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z30.s, z17.s, z18.s\n" - "zip2 z31.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // 
bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // 
bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, 
z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "mov z3.h, #0\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla 
z20.s, z6.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, 
z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e4d4 // bfmmla z20.s, 
z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4da // bfmmla 
z26.s, z6.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "fmax z26.s, p7/m, z26.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z27.s, p7/m, z27.s, z18.s\n" - "fmax z28.s, p7/m, z28.s, z18.s\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z19.s\n" - "fmin z27.s, p7/m, z27.s, z19.s\n" - "fmin z28.s, p7/m, z28.s, z19.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "uzp1 z6.s, z26.s, z27.s\n" - "fmax z29.s, p7/m, z29.s, z18.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z30.s, p7/m, z30.s, z18.s\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "fmax z31.s, p7/m, z31.s, z18.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z29.s, p7/m, 
z29.s, z19.s\n" - "fmin z30.s, p7/m, z30.s, z19.s\n" - "fmin z31.s, p7/m, z31.s, z19.s\n" - "st1w z6.s, p0, [c_ptr2]\n" - "uzp1 z7.s, z28.s, z29.s\n" - "uzp1 z8.s, z30.s, z31.s\n" - "st1w z7.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z8.s, p2, [c_ptr2, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, 
[a_ptr3]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z20.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z21.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z24.s, z19.s, z19.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z25.s, z19.s, z19.s\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z28.d, z22.d\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "mov z29.d, z23.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z25.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z24.s, z17.s, z18.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z25.s, z17.s, z18.s\n" - "ld1w z17.s, p0/z, [c_ptr2]\n" - "ld1w z18.s, p0/z, [c_ptr3]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z26.s, z17.s, z18.s\n" - "zip2 z27.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, 
[c_ptr2, #1, MUL VL]\n" - "ld1w z18.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z28.s, z17.s, z18.s\n" - "zip2 z29.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "zip1 z30.s, z17.s, z18.s\n" - "zip2 z31.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL 
VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 
0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 
// bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - 
"ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 
// bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p6/z, 
[a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, 
p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "fmax z26.s, p7/m, z26.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z27.s, p7/m, z27.s, z18.s\n" - "fmax z28.s, p7/m, z28.s, z18.s\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z19.s\n" - "fmin z27.s, p7/m, z27.s, z19.s\n" - "fmin z28.s, p7/m, z28.s, z19.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "uzp1 z6.s, z26.s, z27.s\n" - "uzp2 z7.s, z26.s, z27.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z29.s, p7/m, z29.s, z18.s\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "fmax z30.s, p7/m, z30.s, z18.s\n" - "st1w 
z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmax z31.s, p7/m, z31.s, z18.s\n" - "fmin z29.s, p7/m, z29.s, z19.s\n" - "fmin z30.s, p7/m, z30.s, z19.s\n" - "st1w z6.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z19.s\n" - "uzp1 z8.s, z28.s, z29.s\n" - "uzp2 z9.s, z28.s, z29.s\n" - "st1w z7.s, p0, [c_ptr3]\n" - "uzp1 z10.s, z30.s, z31.s\n" - "uzp2 z11.s, z30.s, z31.s\n" - "st1w z8.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z9.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z11.s, p2, [c_ptr3, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp deleted file mode 100644 index f25f7473cb..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_mmla_8VLx2 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2; - - hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp deleted file mode 100644 index f38a2ea2e3..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp +++ /dev/null @@ -1,2001 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - float nullbias[256]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip1 z20.s, z15.s, z15.s\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "zip1 z22.s, z15.s, z15.s\n" - "zip2 z23.s, z15.s, z15.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z14.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "mov z1.h, #0\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z14.s, #0\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" 
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z4.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - 
"ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, 
p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z1.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, 
[%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla 
z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 
0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" 
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "uzp1 z1.s, z18.s, z19.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "uzp1 z2.s, z20.s, z21.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "uzp1 z3.s, z22.s, z23.s\n" - "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, 
mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip1 z20.s, z15.s, z15.s\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "zip1 z22.s, z15.s, z15.s\n" - "zip2 z23.s, z15.s, z15.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z11.h, p7/z, 
[%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z4.d, z2.d, z3.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h 
z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, p7/z, 
[%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "trn2 z4.d, z2.d, z3.d\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 
0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // 
bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl 
%[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" 
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "uzp1 z4.s, z20.s, z21.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", 
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z24.d, z16.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z20.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip1 z22.s, z15.s, z15.s\n" - "zip2 z23.s, z15.s, z15.s\n" - "mov z28.d, z20.d\n" - "mov z29.d, z21.d\n" - "mov z30.d, z22.d\n" - "mov z31.d, z23.d\n" - "cbz 
%[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // 
bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, 
p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" 
- "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 
0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" 
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 
0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h 
z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" 
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // 
bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "uzp1 z9.s, z26.s, z27.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z8.s, p0, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "st1w z9.s, p1, [c_ptr2, #1, MUL 
VL]\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z10.s, z28.s, z29.s\n" - "uzp1 z11.s, z30.s, z31.s\n" - "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z11.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, 
z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z20.s, z15.s, z15.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z28.d, z20.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z22.s, z15.s, z15.s\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip2 z23.s, z15.s, z15.s\n" - "mov z29.d, z21.d\n" - "mov z30.d, z22.d\n" - "mov z31.d, z23.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z20.s, 
z13.s, z14.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, 
z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, 
z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, 
z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 
0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], 
#0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, 
MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4d8 // bfmmla 
z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - 
".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - 
".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin 
z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr2]\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "st1w z9.s, p0, [c_ptr3]\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "st1w z10.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z13.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z14.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z15.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" 
(a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp deleted file mode 100644 index ebef413848..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - - -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool); - -class hybrid_fp16_mla_4VLx4 -{ -public: - typedef __fp16 operand_type; - typedef __fp16 result_type; - - typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length<__fp16>() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_fp16_mla_4VLx4; - - hybrid_fp16_mla_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp deleted file mode 100644 index 7610a20ac0..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp +++ /dev/null @@ -1,3778 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - __fp16 nullbias[512]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16))); - } - __fp16 minval = - static_cast<__fp16>(std::numeric_limits::infinity()); - __fp16 maxval = static_cast<__fp16>(std::numeric_limits::infinity()); - const __fp16 * const minptr = &minval; - const __fp16 * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<__fp16>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = leftovers; - const __fp16 *a_ptr0 = a_ptr0_base; - const __fp16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(__fp16); - const __fp16 *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, 
z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "ld1h 
z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, 
MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL 
VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - 
"ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, 
z13.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL 
VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z19.h, 
z11.h, z4.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" 
- "mov z22.d, z18.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla 
z16.h, z12.h, z0.h[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - 
"fmla z21.h, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "ld1h 
z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL 
VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" 
- "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" 
- "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z18.h, 
z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - 
"subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, 
z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], 
#-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla 
z23.h, z11.h, z5.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "fmax z20.h, p7/m, z20.h, z14.h\n" - "fmax z21.h, p7/m, z21.h, z14.h\n" - "fmax z22.h, p7/m, z22.h, z14.h\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.h, p7/m, z23.h, z14.h\n" - "fmin z20.h, p7/m, z20.h, z15.h\n" - "fmin z21.h, p7/m, z21.h, z15.h\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.h, p7/m, z22.h, z15.h\n" - "fmin z23.h, p7/m, z23.h, z15.h\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1h z20.h, p0, [c_ptr1]\n" - "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n" - 
"st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n" - "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h 
z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z24.h, p0/z, [c_ptr2]\n" - "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" 
- "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, 
z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "ld1h z12.h, p7/z, 
[%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, 
z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla 
z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla 
z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla 
z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - 
"fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, 
MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, 
z2.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, 
z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla 
z19.h, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, 
z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, 
p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], 
#0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, 
MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "fmax z20.h, p7/m, z20.h, z14.h\n" - "fmax z21.h, p7/m, z21.h, z14.h\n" - "fmax z22.h, p7/m, z22.h, z14.h\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.h, p7/m, z23.h, z14.h\n" - "fmin z20.h, p7/m, z20.h, z15.h\n" - "fmin z21.h, p7/m, z21.h, z15.h\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.h, p7/m, z22.h, z15.h\n" - "fmin z23.h, p7/m, z23.h, z15.h\n" - "fmax z24.h, p7/m, z24.h, z14.h\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.h, p7/m, z25.h, z14.h\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.h, p7/m, z26.h, z14.h\n" - "st1h z20.h, p0, [c_ptr1]\n" - "fmin z24.h, p7/m, z24.h, z15.h\n" - "fmin z25.h, p7/m, z25.h, z15.h\n" - "fmax z27.h, p7/m, z27.h, z14.h\n" - "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.h, p7/m, z26.h, z15.h\n" - "fmin z27.h, p7/m, z27.h, z15.h\n" - "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n" - "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n" - "st1h z24.h, p0, [c_ptr2]\n" - "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n" - "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n" - "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq 
c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov 
z27.d, z19.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z28.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z24.h, p0/z, [c_ptr2]\n" - "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1h z28.h, p0/z, [c_ptr3]\n" - "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, 
[%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, 
p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - 
"fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla 
z27.h, z15.h, z2.h[7]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z28.h, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z29.h, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" 
- "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - 
"fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z28.h, z8.h, z7.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z29.h, z9.h, z7.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z30.h, z10.h, z7.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z31.h, z11.h, z7.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z28.h, z12.h, z7.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z29.h, z13.h, z7.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z30.h, z14.h, z7.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "fmla z31.h, z15.h, z7.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, 
z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, 
z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, 
z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, 
#0x10]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z28.h, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla 
z29.h, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "ld1h 
z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z28.h, z8.h, z7.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z29.h, z9.h, z7.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z30.h, z10.h, z7.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z31.h, z11.h, z7.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z28.h, z12.h, z7.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z29.h, z13.h, z7.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z30.h, z14.h, z7.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "fmla z31.h, z15.h, z7.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z24.h, z8.h, 
z2.h[0]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, 
z3.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, 
z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, 
z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla 
z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" 
- "fmla z29.h, z9.h, z3.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z28.h, z8.h, z7.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - 
"fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, z7.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z29.h, z9.h, z7.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs 
%[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z28.h, z8.h, z7.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z29.h, z9.h, z7.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" 
- "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z30.h, z10.h, z7.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z31.h, z11.h, z7.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "fmax z20.h, p7/m, z20.h, z14.h\n" - "fmax z21.h, p7/m, z21.h, z14.h\n" - "fmax z22.h, p7/m, z22.h, z14.h\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.h, p7/m, z23.h, z14.h\n" - "fmin z20.h, p7/m, z20.h, z15.h\n" - "fmin z21.h, p7/m, z21.h, z15.h\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.h, p7/m, z22.h, z15.h\n" - "fmin z23.h, p7/m, z23.h, z15.h\n" - "fmax z24.h, p7/m, z24.h, z14.h\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.h, p7/m, z25.h, z14.h\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.h, p7/m, z26.h, z14.h\n" - "st1h z20.h, p0, [c_ptr1]\n" - "fmin z24.h, p7/m, z24.h, z15.h\n" - "fmin z25.h, p7/m, z25.h, z15.h\n" - "fmax z27.h, p7/m, z27.h, z14.h\n" - "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.h, p7/m, z26.h, z15.h\n" - "fmax z28.h, p7/m, z28.h, z14.h\n" - "fmax z29.h, p7/m, z29.h, z14.h\n" - "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z27.h, p7/m, z27.h, z15.h\n" - "fmax z30.h, p7/m, z30.h, z14.h\n" - "fmin z28.h, p7/m, z28.h, z15.h\n" - "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z29.h, p7/m, z29.h, z15.h\n" - "fmax z31.h, p7/m, z31.h, z14.h\n" - "fmin z30.h, p7/m, z30.h, z15.h\n" - "st1h z24.h, p0, [c_ptr2]\n" - "fmin z31.h, p7/m, z31.h, z15.h\n" - "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n" - "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n" - "st1h z27.h, 
p3, [c_ptr2, #3, MUL VL]\n" - "st1h z28.h, p0, [c_ptr3]\n" - "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n" - "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n" - "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp new file mode 100644 index 0000000000..0260050f29 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+// ARGLIST, in order: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, bias, act, accumulate (names as used by the definition in generic.cpp).
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<__fp16>, \
+    size_t, size_t, \
+    const __fp16 *, \
+    IndirectOutputArg<__fp16>, \
+    const __fp16 *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations (the generic variant is defined in sve_hybrid_fp16_mla_6x4VL/generic.cpp)
+void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
+// Descriptor for the fp16 "6x4VL" hybrid kernel: publishes its operand/result types, blocking parameters and entry point.
+class cls_sve_hybrid_fp16_mla_6x4VL
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters: 6 rows by (4 * get_vector_length<__fp16>()) columns, K unroll of 1 */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // not constexpr: depends on get_vector_length<__fp16>()
+    {
+        return get_vector_length<__fp16>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; // NOTE(review): args mirror out_height(), the x4 in out_width() and k_unroll(); confirm against std_transforms_sve.hpp
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
+
+    cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *) // CPUInfo is unused: 'kernel' keeps its generic default
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..b19842b122
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -0,0 +1,3178 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_fp16_mla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + 
"ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 4f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x8\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla 
z8.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { 
z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, 
z6.h, z0.h[3]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, 
z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "13:" // Height 1: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #1\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 18f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov 
z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "23:" // Height 2: input setup done + "cmp x11, #0x8\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + 
"fmla z14.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, 
z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + 
"ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs 
x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "27:" // Height 2: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, 
[x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "add x27, x27, x19, LSL #1\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 32f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL 
VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "37:" // Height 3: input setup done + "cmp x11, #0x8\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + 
"ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla 
z19.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla 
z17.h, z7.h, z2.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, 
[x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL 
VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, 
z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "41:" // Height 3: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], 
#0x18]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 46f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov 
z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "51:" // Height 4: input setup done + "cmp x11, #0x8\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, 
z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, 
z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, 
z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, 
z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, 
z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, 
[x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, 
z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "55:" // Height 4: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs 
x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 60f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, 
p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x23]\n" + "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "65:" // Height 5: input setup done + 
"cmp x11, #0x8\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" 
+ "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, 
MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, 
z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "add x22, x22, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "fmla z24.h, z6.h, z4.h[0]\n" + 
"ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, 
z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "ble 68f\n" + 
"ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "ld1h { z6.h }, 
p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, 
z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmin z24.h, p5/M, z24.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "fmax z24.h, p5/M, z24.h, z1.h\n" + "fmin z25.h, p5/M, z25.h, z0.h\n" + "fmin z26.h, p5/M, z26.h, z0.h\n" + "fmin z27.h, p5/M, z27.h, z0.h\n" + "fmax z25.h, p5/M, z25.h, z1.h\n" + "fmax z26.h, p5/M, z26.h, z1.h\n" + "fmax z27.h, p5/M, z27.h, z1.h\n" + "69:" // Height 5: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1h { z24.h }, p4, [x23]\n" + "st1h { z25.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "add x21, x23, x19, LSL #1\n" + "add %x[output_ptr], x21, x19, LSL #1\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 74f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, 
MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x23]\n" + "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x21]\n" + "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, 
x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "79:" // Height 6: input setup done + "cmp x11, #0x8\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z28.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z29.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "fmla z30.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, 
z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z31.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "fmla z28.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "fmla z29.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z30.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z31.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "fmla z28.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "fmla z29.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "fmla z30.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + 
"fmla z27.h, z7.h, z4.h[2]\n" + "fmla z31.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "fmla z28.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "fmla z29.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z30.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z31.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "fmla z28.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "fmla z29.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "fmla z30.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z31.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla 
z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "fmla z28.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "fmla z29.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z30.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z31.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "fmla z28.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "fmla z29.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "fmla z30.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z31.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, 
z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "fmla z28.h, z6.h, z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "fmla z29.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z30.h, z6.h, z5.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z31.h, z7.h, z5.h[7]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "add x20, x20, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "fmla z28.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "fmla z29.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, 
z4.h[0]\n" + "fmla z30.h, z6.h, z5.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z31.h, z7.h, z5.h[0]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "fmla z28.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "fmla z29.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z30.h, z6.h, z5.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z31.h, z7.h, z5.h[1]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "fmla z28.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "fmla z29.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, 
z4.h[2]\n" + "fmla z30.h, z6.h, z5.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "fmla z31.h, z7.h, z5.h[2]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "fmla z28.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "fmla z29.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z30.h, z6.h, z5.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z31.h, z7.h, z5.h[3]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "fmla z28.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "fmla z29.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, 
z4.h[4]\n" + "fmla z30.h, z6.h, z5.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z31.h, z7.h, z5.h[4]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "fmla z28.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "fmla z29.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z30.h, z6.h, z5.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z31.h, z7.h, z5.h[5]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "fmla z28.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "fmla z29.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, 
z4.h[6]\n" + "fmla z30.h, z6.h, z5.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z31.h, z7.h, z5.h[6]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "fmla z28.h, z6.h, z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "fmla z29.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z30.h, z6.h, z5.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z31.h, z7.h, z5.h[7]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, 
z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmin z24.h, p5/M, z24.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "fmax z24.h, p5/M, z24.h, z1.h\n" + "fmin z25.h, p5/M, z25.h, z0.h\n" + "fmin z26.h, p5/M, z26.h, z0.h\n" + "fmin z27.h, p5/M, z27.h, z0.h\n" + "fmin z28.h, p5/M, z28.h, z0.h\n" + "fmax z25.h, p5/M, z25.h, z1.h\n" + "fmax z26.h, p5/M, z26.h, z1.h\n" + "fmax z27.h, p5/M, z27.h, z1.h\n" + "fmax z28.h, p5/M, z28.h, z1.h\n" + "fmin z29.h, p5/M, z29.h, z0.h\n" + "fmin z30.h, p5/M, z30.h, z0.h\n" + "fmin z31.h, p5/M, z31.h, z0.h\n" + "fmax z29.h, p5/M, z29.h, z1.h\n" + "fmax z30.h, p5/M, z30.h, z1.h\n" + "fmax z31.h, p5/M, z31.h, z1.h\n" + "83:" // Height 6: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { 
z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1h { z24.h }, p4, [x23]\n" + "st1h { z25.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1h { z28.h }, p4, [x21]\n" + "st1h { z29.h }, p3, [x21, #1, MUL VL]\n" + "st1h { z30.h }, p2, [x21, #2, MUL VL]\n" + "st1h { z31.h }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, 
string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp deleted file mode 100644 index 1bc8021e76..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - - -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_fp32_mla_4VLx4 -{ -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_fp32_mla_4VLx4; - - hybrid_fp32_mla_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp deleted file mode 100644 index ce3624340e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp +++ /dev/null @@ -1,2118 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long leftovers = K; - float nullbias[256]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = leftovers; - const float *a_ptr0 = a_ptr0_base; - const float *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "ld1w z15.s, p7/z, 
[%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "ld1w 
z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - 
"fmla z16.s, z8.s, z0.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" 
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 
1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla 
z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" 
- "fmla z21.s, z9.s, z5.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - 
"fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], 
#1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla 
z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, 
z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "5:\n" 
- "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - 
"whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1w 
z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - 
"ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "ld1w 
z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, 
z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], 
#0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla 
z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - 
"fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla 
z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, 
z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" 
(b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov 
z28.d, z16.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, 
[a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - 
"fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla 
z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla z28.s, z12.s, z7.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z29.s, z13.s, z7.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z30.s, z14.s, z7.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, 
z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "fmla z31.s, z15.s, z7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, 
z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "ld1w z10.s, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla z28.s, z12.s, z7.s[3]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z29.s, z13.s, z7.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z30.s, z14.s, z7.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "fmla z31.s, 
z15.s, z7.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla 
z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - 
"fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs 
%[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w 
z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z24.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", 
"z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp new file mode 100644 index 0000000000..f0cc70b76e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_fp32_mla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_fp32_mla_6x4VL +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mla_6x4VL; + + cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp new file mode 100644 index 0000000000..3a6422abd1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp @@ -0,0 +1,2236 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + 
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 4f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x4\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, 
p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, 
z0.s[1]\n" + "addvl x15, x15, #4\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "addvl x15, x15, #4\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "13:" // Height 1: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 18f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + 
"b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "23:" // Height 2: input setup done + "cmp x11, #0x4\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "ld1w { z6.s }, p5/Z, 
[x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "ble 26f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "ble 26f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "ble 26f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, 
z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "27:" // Height 2: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // 
Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 32f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + 
"mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "37:" // Height 3: input setup done + "cmp x11, #0x4\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + 
"fmla z16.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla 
z12.s, z6.s, z1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, 
z2.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "41:" // Height 3: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, 
[x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 46f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s 
}, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "51:" // Height 4: input setup done + "cmp x11, #0x4\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL 
VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, 
[x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + 
"fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, 
z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, 
p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "55:" // Height 4: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" 
+ "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 60f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov 
z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "65:" // Height 5: input setup done + "cmp x11, #0x4\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm 
pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" 
+ "fmla z26.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla 
z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla 
z27.s, z7.s, z4.s[2]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, 
p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "69:" // Height 5: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback 
done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 74f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias 
+ "tbz %x[flags], #0, 75f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + 
"ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "79:" // Height 6: input setup done + "cmp x11, #0x4\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z28.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z29.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [x20, 
#0x80]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z30.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z31.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "fmla z28.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "fmla z29.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z30.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z31.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "fmla z28.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "fmla z29.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + 
"fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z30.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z31.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "fmla z28.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "fmla z29.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z30.s, z6.s, z5.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z31.s, z7.s, z5.s[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "add x20, x20, #0x10\n" + 
"fmla z17.s, z7.s, z2.s[0]\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "fmla z28.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "fmla z29.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z30.s, z6.s, z5.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z31.s, z7.s, z5.s[0]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "fmla z28.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "fmla z29.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z30.s, z6.s, z5.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z31.s, z7.s, z5.s[1]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "fmla z28.s, z6.s, z5.s[2]\n" + 
"ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "fmla z29.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z30.s, z6.s, z5.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z31.s, z7.s, z5.s[2]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "fmla z28.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "fmla z29.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z30.s, z6.s, z5.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z31.s, z7.s, z5.s[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], 
%[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmin z28.s, p5/M, z28.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "fmax z28.s, p5/M, z28.s, z1.s\n" + "fmin z29.s, p5/M, z29.s, z0.s\n" + "fmin 
z30.s, p5/M, z30.s, z0.s\n" + "fmin z31.s, p5/M, z31.s, z0.s\n" + "fmax z29.s, p5/M, z29.s, z1.s\n" + "fmax z30.s, p5/M, z30.s, z1.s\n" + "fmax z31.s, p5/M, z31.s, z1.s\n" + "83:" // Height 6: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, 
maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp new file mode 100644 index 0000000000..20d9922e93 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST /* parameter types shared by the kernel prototype and kern_type */ \
+    unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+    IndirectInputArg<float>,            /* A_arg */ \
+    size_t, size_t,                     /* M, N */ \
+    const float *,                      /* B_ptr */ \
+    IndirectOutputArg<float>,           /* output_arg */ \
+    const float *, Activation, bool     /* bias, act, accumulate */
+
+namespace arm_gemm
+{
+
+// Kernel entry point; the definition lives in sve_hybrid_fp32_mla_8x1VL/generic.cpp
+void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
+
+class cls_sve_hybrid_fp32_mla_8x1VL // Describes the kernel: element types, blocking parameters and entry point
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height() // rows per pass; the assembly in generic.cpp has paths for 1 to 8 rows
+    {
+        return 8;
+    }
+
+    static unsigned int out_width() // one vector of floats; not constexpr, presumably because the vector length is a run-time value
+    {
+        return get_vector_length<float>() * 1;
+    }
+
+    static constexpr unsigned int k_unroll() // K unroll factor
+    {
+        return 1;
+    }
+
+    static constexpr bool supports_accumulate() // the kernel takes an accumulate flag (last ARGLIST parameter)
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {}; // NOTE(review): height 8, 1 vector wide, K block 1 -- confirm against std_transforms_sve.hpp
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
+
+    cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *) // CPUInfo is unused: kernel keeps its generic default
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
new file mode 100644
index 0000000000..361e303c7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -0,0 +1,1751 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_8x1VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 99f\n" + "cmp %x[M], #0x6\n" + "bgt 85f\n" + "beq 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" 
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x8, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x17, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 4f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "addvl x8, x8, #1\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z24.b, #0x0\n" + "6:" // Height 1: setup done + "mov x16, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "cbnz x16, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x14, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x15, #0x4\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x14, x14, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" 
+ "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "add x14, x14, #0x10\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "13:" // Height 1: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 3b\n" + "b 114f\n" + "15:" // Height 2 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "add x13, x13, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 18f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "b 20f\n" + "19:" // 
Height 2: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "20:" // Height 2: setup done + "mov x16, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "cbnz x16, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "23:" // Height 2: input setup done + "cmp x15, #0x4\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "add x12, x12, #0x10\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, 
z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "27:" // Height 2: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 17b\n" + "b 114f\n" + "29:" // Height 3 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "add x11, x11, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 32f\n" + "ld1w { z24.s }, p2/Z, 
[x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "34:" // Height 3: setup done + "mov x16, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "cbnz x16, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "37:" // Height 3: input setup done + "cmp x15, #0x4\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z26.s, z9.s, 
z2.s[1]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "add x10, x10, #0x10\n" + "addvl x7, x7, #1\n" + "ble 40f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "ble 40f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "ble 40f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "41:" // Height 3: 
No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 31b\n" + "b 114f\n" + "43:" // Height 4 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 46f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "48:" // Height 4: setup done + "mov x16, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" 
+ "cbnz x16, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "51:" // Height 4: input setup done + "cmp x15, #0x4\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + 
"ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "add x28, x28, #0x10\n" + "addvl x7, x7, #1\n" + "ble 54f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "ble 54f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "ble 54f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "55:" // Height 4: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + 
"addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 45b\n" + "b 114f\n" + "57:" // Height 5 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 60f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "62:" // Height 5: setup done + "mov x16, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr 
x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "cbnz x16, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "65:" // Height 5: input setup done + "cmp x15, #0x4\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, 
z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "add x26, x26, #0x10\n" + "addvl x7, x7, #1\n" + "ble 68f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "ble 68f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "ble 68f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + 
"tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "69:" // Height 5: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 59b\n" + "b 114f\n" + "71:" // Height 6 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 74f\n" + "ld1w { z24.s }, p2/Z, 
[x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "76:" // Height 6: setup done + "mov x16, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "cbnz x16, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "79:" // Height 6: input setup done + "cmp x15, #0x4\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, 
#0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, 
[x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "add x24, x24, #0x10\n" + "addvl x7, x7, #1\n" + "ble 82f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "ble 82f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "ble 82f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + 
"fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "83:" // Height 6: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 73b\n" + "b 114f\n" + "85:" // Height 7 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 86f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 87f\n" + "86:" // Height 7: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "87:" // Height 7: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 88f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "b 90f\n" + "88:" // Height 7: no bias + 
"tbz %x[flags], #0, 89f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "ld1w { z30.s }, p1/Z, [x23]\n" + "b 90f\n" + "89:" // Height 7: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "90:" // Height 7: setup done + "mov x16, #0x0\n" + "91:" // Height 7: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 92f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "cbnz x16, 93f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 93f\n" + "92:" // Height 7: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "93:" // Height 7: input setup done + "cmp x15, #0x4\n" + "ble 95f\n" + "94:" // Height 7: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + 
"ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "bgt 94b\n" + "95:" // Height 7: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, 
[x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z30.s, z12.s, z6.s[0]\n" + "add x22, x22, #0x10\n" + "addvl x7, x7, #1\n" + "ble 96f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "fmla z30.s, z13.s, z6.s[1]\n" + "ble 96f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "fmla z30.s, z14.s, z6.s[2]\n" + "ble 96f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "fmla z30.s, z15.s, z6.s[3]\n" + "96:" // Height 7: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 91b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" 
+ "tbz %x[flags], #1, 97f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "97:" // Height 7: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "st1w { z30.s }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "98:" // Height 7: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 87b\n" + "b 114f\n" + "99:" // Height 8 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 100f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "ldr x21, [%x[output_ptr], #0x38]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add %x[output_ptr], 
%x[output_ptr], #0x40\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 101f\n" + "100:" // Height 8: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "101:" // Height 8: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 102f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "b 104f\n" + "102:" // Height 8: no bias + "tbz %x[flags], #0, 103f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "ld1w { z30.s }, p1/Z, [x23]\n" + "ld1w { z31.s }, p1/Z, [x21]\n" + "b 104f\n" + "103:" // Height 8: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "104:" // Height 8: setup done + "mov x16, #0x0\n" + "105:" // Height 8: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 106f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x16, 107f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, 
LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 107f\n" + "106:" // Height 8: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "107:" // Height 8: input setup done + "cmp x15, #0x4\n" + "ble 109f\n" + "108:" // Height 8: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1rqw { z7.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z31.s, z8.s, z7.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z29.s, z9.s, 
z5.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z31.s, z9.s, z7.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z31.s, z10.s, z7.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "fmla z31.s, z11.s, z7.s[3]\n" + "bgt 108b\n" + "109:" // Height 8: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z30.s, z12.s, z6.s[0]\n" + "ld1rqw { z7.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z31.s, z12.s, z7.s[0]\n" + "add x20, x20, #0x10\n" + "addvl x7, x7, #1\n" + "ble 110f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "fmla z30.s, z13.s, 
z6.s[1]\n" + "fmla z31.s, z13.s, z7.s[1]\n" + "ble 110f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "fmla z30.s, z14.s, z6.s[2]\n" + "fmla z31.s, z14.s, z7.s[2]\n" + "ble 110f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "fmla z30.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "110:" // Height 8: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 105b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 111f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, 
z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmin z31.s, p2/M, z31.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "fmax z31.s, p2/M, z31.s, z17.s\n" + "111:" // Height 8: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "st1w { z30.s }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "st1w { z31.s }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "112:" // Height 8: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 101b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 114f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 113f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "113:" // Update direct input + "mov x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "114:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", 
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp deleted file mode 100644 index fd416ed2f4..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - - -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -/* Out-of-line kernel entry point. Its definition (generic.cpp, removed by this same patch) names the parameters, in order: A, lda, B, C, ldc, M, N, K, bias, act, accumulate. */ void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - -/* Descriptor for the SVE fp32 hybrid MMLA kernel: publishes the operand and result element types, the kernel function-pointer type, the blocking parameters, the capability flags, and the kernel pointer itself. NOTE(review): presumably consumed by a templated GEMM driver elsewhere in arm_gemm; the consumer is not visible here. */ class hybrid_fp32_mmla_4VLx4 -{ -public: - typedef float operand_type; - typedef float result_type; - - /* Function-pointer type with the same parameter list as sve_hybrid_fp32_mmla_4VLx4 declared above. */ typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - /* Always 8; the kernel definition likewise appears to cap rows_to_compute at 8 per call. */ static constexpr unsigned int out_height() - { - return 8; - } - - /* Twice the value returned by get_vector_length(), so not constexpr like the other accessors. NOTE(review): a template argument list on get_vector_length (likely float) appears to have been stripped from this rendering; confirm against the original file. */ static unsigned int out_width() - { - return get_vector_length() * 2; - } - - /* Always 2; consistent with the kernel definition padding K up to a multiple of 2 (K_stride). */ static constexpr unsigned int k_unroll() - { - return 2; - } - - /* The three capability flags below all return true, matching the accumulate, bias and Activation parameters of the kernel. */ static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - /* NOTE(review): StdTransformsSVE is presumably a class template whose arguments were stripped from this rendering; confirm against the original file. */ StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_fp32_mmla_4VLx4; - - /* The CPUInfo pointer is unnamed and unused; the constructor body is empty, so kernel keeps its default value. */ hybrid_fp32_mmla_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp deleted file mode 100644 index 1364585604..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp +++ /dev/null @@ -1,3459 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 1) / 2) * 2; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long leftovers = K; - const long blocks_count = (K + 1) / 2; - float nullbias[128]; - if (!accumulate && !bias) { - memset(nullbias, 0, (2 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (2 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const float *a_ptr0 = a_ptr0_base; - const float *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z14.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "mov z1.s, #0\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "mov z14.s, #0\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - "subs %[loops], %[loops], #0x1\n" - 
".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // 
fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin 
z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp1 z1.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, 
[%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "subs %[loops], %[loops], #0x1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "ld1w z12.s, 
p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, 
z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 
0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p0, [c_ptr1]\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] 
"r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z20.d, z16.d\n" - "mov z21.d, z17.d\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, 
[a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, 
z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.s, #0\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, 
p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, 
p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.s, #0\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 
0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 
0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp1 z5.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq 
c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], 
#4\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, 
z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, 
z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - 
"ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // 
fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - 
".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, 
p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, 
%[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z5.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z22.d, z18.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z23.d, z19.d\n" - "mov z24.d, z16.d\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z5.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, 
[c_ptr3]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL 
VL]\n" - "mov z9.s, #0\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL 
VL]\n" - "mov z5.s, #0\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w 
z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // 
fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.s, #0\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "b.eq 5f\n" - 
"ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w 
z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 
0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, 
p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z9.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, 
#0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "add a_ptr5, a_ptr5, #0x10\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, 
MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64a0e5b1 // 
fmmla z17.s, z13.s, z0.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z2.s, p7/z, 
[a_ptr2, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla 
z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, 
z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL 
VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, 
p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - 
".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin 
z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p0, [c_ptr5]\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, 
%[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z7.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "mov z20.d, z16.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z25.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z7.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" 
- "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - 
".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // 
fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla 
z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, 
z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, 
z15.s, z11.s\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 
0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p6/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, 
z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, 
z15.s, z3.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, 
p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp1 z13.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", 
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; - default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z20.d, z16.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - 
"mov z21.d, z17.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.d, z18.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z25.d, z17.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z26.d, z18.d\n" - "add a_ptr7, a_ptr7, #0x10\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "add a_ptr7, a_ptr7, #0x10\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, 
z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "ld1w z14.s, p0/z, [c_ptr7]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w 
z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p7/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "add a_ptr7, a_ptr7, #0x20\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw 
z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64abe5ff // 
fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p7/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, 
z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "addvl a_ptr7, a_ptr7, #2\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, 
[%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla 
z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 
0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p6/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p6/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "addvl a_ptr7, a_ptr7, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 
z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 
0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - 
"fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p0, [c_ptr7]\n" - "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" - "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp new file mode 100644 index 0000000000..0150ce8fd9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_s8qa_dot_4x4VL( ARGLIST ); + +class cls_sve_hybrid_s8qa_dot_4x4VL +{ +public: + typedef int8_t operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return get_vector_length() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_s8qa_dot_4x4VL; + + cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp new file mode 100644 index 0000000000..2b1448bd65 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -0,0 +1,1602 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void sve_hybrid_s8qa_dot_4x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 46f\n" + "cmp %x[M], #0x2\n" + "bgt 31f\n" + "beq 16f\n" + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + 
"2:" // Height 1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "ble 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z18.s, z7.b, 
z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "tbnz %x[flags], #31, 9f\n" + "sdot z11.s, z0.b, z15.b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x10\n" + "bgt 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z18.s, z6.b, 
z0.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 12f\n" + "sdot z11.s, z0.b, z15.b\n" + "12:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 13f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x19]\n" + "neg z1.s, p2/M, z1.s\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x19\n" + "saddv d11, p0, z11.s\n" + "mov z11.s, z11.s[0]\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + "13:" // Height 1: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 14f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "14:" // Height 1: no shift correction + ".inst 0x44828810 // srshl z16.s, 
p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "15:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 3b\n" + "b 62f\n" + "16:" // Height 2 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 17f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "18:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "19:" // Height 2: setup done + "mov x28, #0x0\n" + 
"20:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 21f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 22f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 22f\n" + "21:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "22:" // Height 2: input setup done + "cmp x27, #0x10\n" + "ble 25f\n" + "23:" // Height 2: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, 
z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 24f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "24:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "bgt 23b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "ble 26f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, 
z5.b, z1.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "ble 26f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "ble 26f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 27f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "27:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 20b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 28f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x19]\n" + "neg z2.s, p2/M, z2.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov z11.s, z11.s[0]\n" + "mov z12.s, z12.s[0]\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "28:" // Height 2: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr 
%x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "tbz %x[flags], #5, 29f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "29:" // Height 2: no shift correction + 
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "30:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 18b\n" + "b 62f\n" + "31:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, 
%x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 32f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 33f\n" + "32:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "33:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "34:" // Height 3: setup done + "mov x28, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "37:" // Height 3: input setup done + "cmp x27, #0x10\n" + "ble 40f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, 
#0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z17.s, 
z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 39f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bgt 38b\n" + "40:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z6.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "sdot z25.s, z7.b, z2.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z26.s, z8.b, z2.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "sdot z27.s, z9.b, z2.b[0]\n" + "ble 41f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z25.s, z4.b, z2.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot 
z26.s, z5.b, z2.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "sdot z27.s, z6.b, z2.b[1]\n" + "ble 41f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z25.s, z8.b, z2.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z26.s, z9.b, z2.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "sdot z27.s, z10.b, z2.b[2]\n" + "ble 41f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z25.s, z5.b, z2.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z26.s, z6.b, z2.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "sdot z27.s, z7.b, z2.b[3]\n" + "41:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 42f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "42:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 43f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x19]\n" + "neg z3.s, p2/M, 
z3.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov z12.s, z12.s[0]\n" + "saddv d13, p0, z13.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "43:" // Height 3: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + 
".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 44f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "sqadd z25.s, z25.s, z6.s\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "44:" // Height 3: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, 
z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "45:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + 
"bgt 33b\n" + "b 62f\n" + "46:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 47f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "48:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "49:" // Height 4: setup done + "mov x28, #0x0\n" + "50:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 52f\n" + "51:" // Height 4: setup direct 
input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "52:" // Height 4: input setup done + "cmp x27, #0x10\n" + "ble 55f\n" + "53:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z28.s, z4.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z29.s, z5.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z30.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "sdot z28.s, z8.b, z3.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z30.s, z10.b, 
z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "sdot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z29.s, z6.b, z3.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z30.s, z7.b, z3.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z31.s, z8.b, z3.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z29.s, z10.b, z3.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z30.s, z4.b, z3.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 54f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z14.s, z3.b, z15.b\n" + "54:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 53b\n" + "55:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, 
p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z6.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z6.b, z3.b[0]\n" + "sdot z25.s, z7.b, z2.b[0]\n" + "sdot z29.s, z7.b, z3.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z26.s, z8.b, z2.b[0]\n" + "sdot z30.s, z8.b, z3.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "sdot z27.s, z9.b, z2.b[0]\n" + "sdot z31.s, z9.b, z3.b[0]\n" + "ble 56f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z10.b, z3.b[1]\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z25.s, z4.b, z2.b[1]\n" + "sdot z29.s, z4.b, z3.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot z26.s, z5.b, z2.b[1]\n" + "sdot z30.s, z5.b, z3.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "sdot z27.s, z6.b, z2.b[1]\n" + "sdot z31.s, z6.b, z3.b[1]\n" + "ble 56f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z7.b, z3.b[2]\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z25.s, z8.b, z2.b[2]\n" + "sdot 
z29.s, z8.b, z3.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z26.s, z9.b, z2.b[2]\n" + "sdot z30.s, z9.b, z3.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "sdot z27.s, z10.b, z2.b[2]\n" + "sdot z31.s, z10.b, z3.b[2]\n" + "ble 56f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z4.b, z3.b[3]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z25.s, z5.b, z2.b[3]\n" + "sdot z29.s, z5.b, z3.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z26.s, z6.b, z2.b[3]\n" + "sdot z30.s, z6.b, z3.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "sdot z27.s, z7.b, z2.b[3]\n" + "sdot z31.s, z7.b, z3.b[3]\n" + "56:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 57f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z14.s, z3.b, z15.b\n" + "57:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 58f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "neg z4.s, p2/M, z4.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, 
x19\n" + "mov x19, #0x4\n" + "mov z12.s, z12.s[0]\n" + "saddv d13, p0, z13.s\n" + "whilelt p0.s, XZR, x19\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "saddv d14, p0, z14.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mov z14.s, z14.s[0]\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "58:" // Height 4: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z28.s, z28.s, z14.s\n" + "add z29.s, z29.s, z14.s\n" + "add z30.s, z30.s, z14.s\n" + "add z31.s, z31.s, z14.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // 
sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "tbz %x[flags], #5, 59f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "and z9.d, z28.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "and z10.d, z29.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "and z4.d, z30.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z27.s, z27.s, z8.s\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z28.s, z28.s, z9.s\n" + "sqadd z29.s, z29.s, z10.s\n" + "sqadd z30.s, z30.s, z4.s\n" + "sqadd z31.s, z31.s, z5.s\n" 
+ "59:" // Height 4: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl 
z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "add z28.s, z28.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "addvl x23, x23, #1\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "60:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 48b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" 
(offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp new file mode 100644 index 0000000000..d8562898aa --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+// Kernel argument types, shared by the declaration and kern_type below. NOTE(review): Indirect*Arg element type int8_t assumed from operand_type/result_type - confirm.
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations (defined in sve_hybrid_s8qs_dot_6x4VL/generic.cpp)
+void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
+// Describes the kernel declared above: element types, blocking parameters and the function pointer to call.
+class cls_sve_hybrid_s8qs_dot_6x4VL
+{
+public:
+    typedef int8_t operand_type; // element type of the A and B inputs
+    typedef int8_t result_type;  // element type written to the output
+
+    typedef void (*kern_type)( ARGLIST ); // pointer type matching the kernel declaration
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height() // output rows per block
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // not constexpr: depends on the vector length queried at run time
+    {
+        return get_vector_length<int32_t>() * 4; // four vectors' worth of 32-bit lanes
+    }
+
+    static constexpr unsigned int k_unroll() // K-dimension blocking factor
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate() // accumulation into existing output is not offered
+    {
+        return false;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // NOTE(review): arguments assumed to mirror out_height(), 4 vectors, k_unroll() - confirm
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
+
+    cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *) // CPUInfo is unused: kernel keeps its generic default
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..4a4af6356c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void sve_hybrid_s8qs_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x16, %x[col_bias]\n" + 
"ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "4:" // Height 1: setup done + "mov x12, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x11, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, 
z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, 
z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "10:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z9.s, z9.s, z1.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 11f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 12f\n" + "11:" // Height 1: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "12:" // Height 1: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + "tbz %x[flags], #5, 13f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z9.s, z9.s, z5.s\n" + "sqadd 
z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "13:" // Height 1: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "17:" // Height 2: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov 
z15.s, #0x0\n" + "18:" // Height 2: setup done + "mov x12, #0x0\n" + "19:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "21:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, 
MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ble 24f\n" + "ld1b 
{ z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 19b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "add z9.s, z9.s, z1.s\n" + "addvl x16, x16, #4\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "tbz %x[flags], #4, 25f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { 
z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 26f\n" + "25:" // Height 2: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "26:" // Height 2: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + "tbz %x[flags], #5, 27f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "sqadd z14.s, z14.s, z6.s\n" + "sqadd z15.s, z15.s, z7.s\n" + "27:" // Height 2: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, 
z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + 
"add x27, x27, x19\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "31:" // Height 3: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "32:" // Height 3: setup done + "mov x12, #0x0\n" + "33:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 34f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 35f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 35f\n" + "34:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "35:" // Height 3: input setup done + "cmp x11, #0x10\n" + "ble 37f\n" + "36:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, 
#3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "bgt 36b\n" + "37:" // Height 3: 
Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "add x26, x26, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z11.s, 
z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "38:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 33b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 39f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 40f\n" + 
"39:" // Height 3: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "40:" // Height 3: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + "tbz %x[flags], #5, 41f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, 
z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "41:" // Height 3: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin 
z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "45:" // Height 4: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "46:" // Height 4: setup done + "mov x12, 
#0x0\n" + "47:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 48f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 49f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 49f\n" + "48:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "49:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 51f\n" + "50:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, 
z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, 
z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "bgt 50b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, 
z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "52:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 47b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, 
MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "tbz %x[flags], #4, 53f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 54f\n" + "53:" // Height 4: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "54:" // Height 4: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh 
z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + "tbz %x[flags], #5, 55f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z21.s, z21.s, z5.s\n" + "sqadd z22.s, z22.s, z6.s\n" + "sqadd z23.s, z23.s, z7.s\n" + "55:" // Height 4: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, 
p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, 
z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "59:" // Height 5: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, 
#0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "60:" // Height 5: setup done + "mov x12, #0x0\n" + "61:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 62f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 63f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 63f\n" + "62:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "63:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 65f\n" + "64:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + 
"sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + 
"sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "bgt 64b\n" + "65:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, 
p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" 
+ "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "66:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 61b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + 
"add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 67f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 68f\n" + "67:" // Height 5: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "68:" // Height 5: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + ".inst 
0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + "tbz %x[flags], #5, 69f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z21.s, z21.s, z5.s\n" + "and z4.d, z24.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z22.s, z22.s, z6.s\n" + "and z5.d, z25.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z23.s, z23.s, z7.s\n" + "and z6.d, z26.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z4.s\n" + "and z7.d, z27.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z25.s, z25.s, z5.s\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd z27.s, z27.s, z7.s\n" + "69:" // Height 5: no shift 
correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, 
z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z27.s, z27.s, z4.s\n" + "addvl x25, x25, #1\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, 
z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "73:" // Height 6: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "74:" // Height 6: setup done + "mov x12, #0x0\n" + "75:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, 
[%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 76f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 77f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 77f\n" + "76:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "77:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 79f\n" + "78:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + 
"sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { 
z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "bgt 78b\n" + "79:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, 
z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, 
z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "80:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm 
pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 75b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + "tbz %x[flags], #4, 81f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 82f\n" + "81:" // Height 6: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + 
"mov z7.d, z4.d\n" + "82:" // Height 6: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n" + ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n" + ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" + "tbz %x[flags], #5, 83f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, 
z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z21.s, z21.s, z5.s\n" + "and z4.d, z24.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z22.s, z22.s, z6.s\n" + "and z5.d, z25.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z23.s, z23.s, z7.s\n" + "and z6.d, z26.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z4.s\n" + "and z7.d, z27.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z25.s, z25.s, z5.s\n" + "and z4.d, z28.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z26.s, z26.s, z6.s\n" + "and z5.d, z29.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z27.s, z27.s, z7.s\n" + "and z6.d, z30.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z28.s, z28.s, z4.s\n" + "and z7.d, z31.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z29.s, z29.s, z5.s\n" + "sqadd z30.s, z30.s, z6.s\n" + "sqadd z31.s, z31.s, z7.s\n" + "83:" // Height 6: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, 
z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin 
z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z27.s, z27.s, z4.s\n" + "addvl x25, x25, #1\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "add z30.s, z30.s, z4.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smax z29.s, p2/M, z29.s, 
z5.s\n" + "addvl x23, x23, #1\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "add z31.s, z31.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", 
"x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp deleted file mode 100644 index c500f43fe0..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - -class hybrid_s8s32_dot_4VLx4 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_s8s32_dot_4VLx4; - - hybrid_s8s32_dot_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp deleted file mode 100644 index b30b8845a6..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2137 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const int8_t *a_ptr0 = a_ptr0_base; - const int8_t *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(int32_t); - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z18.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z19.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz 
%[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" 
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot 
z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, 
[%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "ld1b z13.b, 
p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z18.s, z14.b, 
z4.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z19.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w 
z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot 
z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - 
"sdot z20.s, z8.b, z5.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot 
z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot 
z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, 
z1.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, 
z13.b, z1.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z17.s, z13.b, 
z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z20.s, #0\n" - "ld1b z9.b, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z26.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z27.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z21.s, 
z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot 
z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, 
z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "ld1b 
z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - 
"sdot z25.s, z9.b, z2.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, 
z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, 
z2.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - 
"sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", 
"z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "mov z20.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z21.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z26.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z27.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z28.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "mov z29.s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "mov z30.s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z31.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w 
z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "sdot z30.s, z10.b, 
z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot 
z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, 
z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "sdot z29.s, z9.b, 
z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot 
z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, 
z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, 
z10.b, z2.b[0]\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, 
z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, 
z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z29.s, z9.b, z7.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "sdot z19.s, 
z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, z7.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, 
z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp new file mode 100644 index 0000000000..1aebedb861 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int32_t>, \ + const int32_t *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_s8s32_dot_6x4VL( ARGLIST ); + +class cls_sve_hybrid_s8s32_dot_6x4VL +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<int32_t>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_s8s32_dot_6x4VL; + + cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp new file mode 100644 index 0000000000..cae9bf329f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp @@ -0,0 +1,1904 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_s8s32_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const int32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + 
"whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "5:" // Height 1: setup done + "mov x12, #0x0\n" + "6:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x11, #0x10\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL 
VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL 
VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 6b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "12:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 74f\n" + "13:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 14f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 15f\n" + "14:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 16f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "17:" // Height 2: setup done + "mov x12, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "20:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, 
MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs 
x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 18b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "24:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 15b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 26f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + 
"add x27, x27, x19, LSL #2\n" + "b 27f\n" + "26:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "27:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 28f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "29:" // Height 3: setup done + "mov x12, #0x0\n" + "30:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "32:" // Height 3: input setup done + "cmp x11, 
#0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, 
z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "add x26, x26, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + 
"addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "35:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 30b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, 
[x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "36:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 27b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 38f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 39f\n" + "38:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "39:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 40f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, 
[x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "41:" // Height 4: setup done + "mov x12, #0x0\n" + "42:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "44:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + 
"sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, 
#-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, 
z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "47:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, 
x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 42b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "48:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 39b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 50f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "51:" // 
Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 52f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "53:" // Height 5: setup done + "mov x12, #0x0\n" + "54:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + 
"ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "56:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, 
MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, 
z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + 
"sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, 
z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "59:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "60:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 51b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 62f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 63f\n" + "62:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "63:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 64f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { 
z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "65:" // Height 6: setup done + "mov x12, #0x0\n" + "66:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "68:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, 
z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot 
z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot 
z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot 
z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + 
"sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "71:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 66b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, 
#1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "72:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 63b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp new file mode 100644 index 
0000000000..964f7cc2c1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +/* + * ARGLIST is the parameter type list shared by the kernel declaration and the + * kern_type typedef below; it is undefined again at the end of this header. + * The definition in sve_hybrid_u8qa_dot_4x4VL/generic.cpp names the parameters, + * in order: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, qp, + * col_bias, followed by one unnamed unsigned int. + */ +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_u8qa_dot_4x4VL( ARGLIST ); + +/* + * Descriptor for the sve_hybrid_u8qa_dot_4x4VL kernel: it publishes the operand and + * result element types, the kernel function-pointer type, the blocking parameters + * and the kernel entry point to call. + * NOTE(review): IndirectInputArg, IndirectOutputArg, get_vector_length() and + * StdTransformsSVE appear in this text without template arguments; these may have + * been lost in extraction - confirm against the upstream header. + */ +class cls_sve_hybrid_u8qa_dot_4x4VL +{ +public: + /* Element types of the kernel's operands and of its results (both uint8_t). */ + typedef uint8_t operand_type; + typedef uint8_t result_type; + + /* Function-pointer type with the same parameter list as the kernel declared above. */ + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + /* Output block height: always 4. */ + static constexpr unsigned int out_height() + { + return 4; + } + + /* Output block width: four times get_vector_length(). Unlike the other blocking parameters this one is not declared constexpr. */ + static unsigned int out_width() + { + return get_vector_length() * 4; + } + + /* K unroll factor: always 4. */ + static constexpr unsigned int k_unroll() + { + return 4; + } + + /* Reports that this kernel does not support accumulation. */ + static constexpr bool supports_accumulate() + { + return false; + } + + /* Transforms member, value-initialised with empty braces. */ + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_u8qa_dot_4x4VL; + + /* The CPUInfo pointer is unnamed and unused and the body is empty, so kernel keeps its default value. */ + cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp new file mode 100644 index 0000000000..0a6546b78a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp @@ -0,0 +1,1602 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void sve_hybrid_u8qa_dot_4x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 46f\n" + "cmp %x[M], #0x2\n" + "bgt 31f\n" + "beq 16f\n" + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + 
"2:" // Height 1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "ble 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z17.s, z9.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z18.s, z7.b, 
z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "tbnz %x[flags], #31, 9f\n" + "udot z11.s, z0.b, z15.b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x10\n" + "bgt 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z17.s, z4.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "udot z19.s, z6.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z17.s, z8.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "udot z19.s, z10.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z18.s, z6.b, 
z0.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 12f\n" + "udot z11.s, z0.b, z15.b\n" + "12:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 13f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x19]\n" + "neg z1.s, p2/M, z1.s\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d11, p0, z11.s\n" + "mov z11.s, z11.s[0]\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + "13:" // Height 1: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 14f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "14:" // Height 1: no shift correction + ".inst 0x44828810 // srshl z16.s, 
p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "15:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 3b\n" + "b 62f\n" + "16:" // Height 2 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 17f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "18:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "19:" // Height 2: setup done + "mov x28, #0x0\n" + 
"20:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 21f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 22f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 22f\n" + "21:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "22:" // Height 2: input setup done + "cmp x27, #0x10\n" + "ble 25f\n" + "23:" // Height 2: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z23.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, 
z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 24f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "24:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "bgt 23b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z7.b, z1.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "ble 26f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z4.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, 
z5.b, z1.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "ble 26f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z8.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "ble 26f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 27f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "27:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 20b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 28f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x19]\n" + "neg z2.s, p2/M, z2.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov z11.s, z11.s[0]\n" + "mov z12.s, z12.s[0]\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "28:" // Height 2: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr 
%x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "tbz %x[flags], #5, 29f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "29:" // Height 2: no shift correction + 
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "30:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 18b\n" + "b 62f\n" + "31:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, 
%x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 32f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 33f\n" + "32:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "33:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "34:" // Height 3: setup done + "mov x28, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "37:" // Height 3: input setup done + "cmp x27, #0x10\n" + "ble 40f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, 
#0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z26.s, z6.b, z2.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z17.s, 
z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 39f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bgt 38b\n" + "40:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z6.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z7.b, z1.b[0]\n" + "udot z25.s, z7.b, z2.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z26.s, z8.b, z2.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "udot z27.s, z9.b, z2.b[0]\n" + "ble 41f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z4.b, z0.b[1]\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z25.s, z4.b, z2.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot 
z26.s, z5.b, z2.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "udot z27.s, z6.b, z2.b[1]\n" + "ble 41f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z8.b, z0.b[2]\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z25.s, z8.b, z2.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z26.s, z9.b, z2.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "udot z27.s, z10.b, z2.b[2]\n" + "ble 41f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z5.b, z0.b[3]\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z25.s, z5.b, z2.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z26.s, z6.b, z2.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "udot z27.s, z7.b, z2.b[3]\n" + "41:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 42f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "42:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 43f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x19]\n" + "neg z3.s, p2/M, 
z3.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov z12.s, z12.s[0]\n" + "uaddv d13, p0, z13.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "43:" // Height 3: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + 
".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 44f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "sqadd z25.s, z25.s, z6.s\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "44:" // Height 3: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, 
z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "45:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + 
"bgt 33b\n" + "b 62f\n" + "46:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 47f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "48:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "49:" // Height 4: setup done + "mov x28, #0x0\n" + "50:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 52f\n" + "51:" // Height 4: setup direct 
input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "52:" // Height 4: input setup done + "cmp x27, #0x10\n" + "ble 55f\n" + "53:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z28.s, z4.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z29.s, z5.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z30.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "udot z28.s, z8.b, z3.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z30.s, z10.b, 
z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "udot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "udot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z29.s, z6.b, z3.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z30.s, z7.b, z3.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z31.s, z8.b, z3.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z29.s, z10.b, z3.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z30.s, z4.b, z3.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "udot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 54f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z14.s, z3.b, z15.b\n" + "54:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 53b\n" + "55:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, 
p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z6.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "udot z21.s, z7.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z6.b, z3.b[0]\n" + "udot z25.s, z7.b, z2.b[0]\n" + "udot z29.s, z7.b, z3.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z26.s, z8.b, z2.b[0]\n" + "udot z30.s, z8.b, z3.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "udot z27.s, z9.b, z2.b[0]\n" + "udot z31.s, z9.b, z3.b[0]\n" + "ble 56f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z10.b, z3.b[1]\n" + "udot z17.s, z4.b, z0.b[1]\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z25.s, z4.b, z2.b[1]\n" + "udot z29.s, z4.b, z3.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot z26.s, z5.b, z2.b[1]\n" + "udot z30.s, z5.b, z3.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "udot z27.s, z6.b, z2.b[1]\n" + "udot z31.s, z6.b, z3.b[1]\n" + "ble 56f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z7.b, z3.b[2]\n" + "udot z17.s, z8.b, z0.b[2]\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z25.s, z8.b, z2.b[2]\n" + "udot 
z29.s, z8.b, z3.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z26.s, z9.b, z2.b[2]\n" + "udot z30.s, z9.b, z3.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "udot z27.s, z10.b, z2.b[2]\n" + "udot z31.s, z10.b, z3.b[2]\n" + "ble 56f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z4.b, z3.b[3]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z25.s, z5.b, z2.b[3]\n" + "udot z29.s, z5.b, z3.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z26.s, z6.b, z2.b[3]\n" + "udot z30.s, z6.b, z3.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "udot z27.s, z7.b, z2.b[3]\n" + "udot z31.s, z7.b, z3.b[3]\n" + "56:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 57f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z14.s, z3.b, z15.b\n" + "57:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 58f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "neg z4.s, p2/M, z4.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, 
x19\n" + "mov x19, #0x4\n" + "mov z12.s, z12.s[0]\n" + "uaddv d13, p0, z13.s\n" + "whilelt p0.s, XZR, x19\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "uaddv d14, p0, z14.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mov z14.s, z14.s[0]\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "58:" // Height 4: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z28.s, z28.s, z14.s\n" + "add z29.s, z29.s, z14.s\n" + "add z30.s, z30.s, z14.s\n" + "add z31.s, z31.s, z14.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // 
sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "tbz %x[flags], #5, 59f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "and z9.d, z28.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "and z10.d, z29.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "and z4.d, z30.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z27.s, z27.s, z8.s\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z28.s, z28.s, z9.s\n" + "sqadd z29.s, z29.s, z10.s\n" + "sqadd z30.s, z30.s, z4.s\n" + "sqadd z31.s, z31.s, z5.s\n" 
+ "59:" // Height 4: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl 
z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "add z28.s, z28.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "addvl x23, x23, #1\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "60:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 48b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" 
(offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp deleted file mode 100644 index c325e522d7..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - -class hybrid_u8u32_dot_4VLx4 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_u8u32_dot_4VLx4; - - hybrid_u8u32_dot_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp deleted file mode 100644 index 565832e8de..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2137 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const uint8_t *a_ptr0 = a_ptr0_base; - const uint8_t *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(uint32_t); - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z18.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z19.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - 
"cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z19.s, z11.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z16.s, z12.b, z0.b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, 
z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - 
"udot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z18.s, z14.b, z4.b[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z19.s, z15.b, z4.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, 
p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "ld1b 
z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z19.s, z11.b, z0.b[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z18.s, z14.b, 
z4.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z19.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w 
z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot 
z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - 
"udot z20.s, z8.b, z5.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot 
z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot 
z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, 
z1.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, 
z13.b, z1.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z17.s, z13.b, 
z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z20.s, #0\n" - "ld1b z9.b, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z26.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z27.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z21.s, 
z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z23.s, z11.b, z1.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot 
z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, 
z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "udot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "ld1b 
z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - 
"udot z25.s, z9.b, z2.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, 
z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, 
z2.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - 
"udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", 
"z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "mov z20.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z21.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z26.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z27.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z28.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "mov z29.s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "mov z30.s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z31.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w 
z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z22.s, z10.b, z1.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z26.s, z10.b, z2.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "udot z30.s, z10.b, 
z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot 
z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, 
z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, z8.b, z7.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "udot z29.s, z9.b, 
z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "udot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot 
z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z24.s, z8.b, z6.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, 
z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, z8.b, z7.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "udot z29.s, z9.b, z3.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, 
z10.b, z2.b[0]\n" - "udot z30.s, z10.b, z3.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, 
z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "udot z26.s, z10.b, z2.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "udot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, 
z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z28.s, z8.b, z7.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "udot z19.s, 
z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, z7.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, z8.b, z7.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, 
z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp new file mode 100644 index 0000000000..af9de4a6eb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+// Parameter types shared by the kernel declaration and the kern_type pointer; #undef'd at end of file.
+#define ARGLIST \
+    unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+    IndirectInputArg<uint8_t>, /* A_arg: direct or indirect input */ \
+    size_t, size_t, /* M, N */ \
+    const uint8_t *, /* B_ptr */ \
+    IndirectOutputArg<uint32_t>, /* output_arg: direct or indirect output */ \
+    const uint32_t *, Activation, bool /* two unnamed args, then accumulate */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+// Descriptor for the kernel above: operand/result types, blocking parameters and entry point.
+class cls_sve_hybrid_u8u32_dot_6x4VL
+{
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6; // the "6" in 6x4VL
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<uint32_t>() * 4; // 4 vectors of result_type; not constexpr, VL is a runtime value
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4; // presumably the 4 bytes each udot lane consumes - confirm against the kernel
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true; // the kernel signature ends with a bool accumulate argument
+    }
+    // Template arguments mirror the accessors above: height 6, width 4 vectors, K block 4.
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
+
+    cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..fc8ce636dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void sve_hybrid_u8u32_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" 
+ "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "5:" // Height 1: setup done + "mov x12, #0x0\n" + "6:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x11, #0x10\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL 
VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "udot z11.s, z7.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "udot z11.s, z7.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL 
VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 6b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "12:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 74f\n" + "13:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 14f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 15f\n" + "14:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 16f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "17:" // Height 2: setup done + "mov x12, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "20:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, 
MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs 
x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 18b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "24:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 15b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 26f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + 
"add x27, x27, x19, LSL #2\n" + "b 27f\n" + "26:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "27:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 28f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "29:" // Height 3: setup done + "mov x12, #0x0\n" + "30:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "32:" // Height 3: input setup done + "cmp x11, 
#0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, 
z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "add x26, x26, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + 
"addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "35:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 30b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, 
[x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "36:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 27b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 38f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 39f\n" + "38:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "39:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 40f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, 
[x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "41:" // Height 4: setup done + "mov x12, #0x0\n" + "42:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "44:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + 
"udot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, 
#-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, 
z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "47:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, 
x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 42b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "48:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 39b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 50f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "51:" // 
Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 52f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "53:" // Height 5: setup done + "mov x12, #0x0\n" + "54:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + 
"ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "56:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "udot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, 
MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, 
z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z20.s, z6.b, z3.b[0]\n" + "udot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + 
"udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, 
z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "59:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "60:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 51b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 62f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 63f\n" + "62:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "63:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 64f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { 
z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "65:" // Height 6: setup done + "mov x12, #0x0\n" + "66:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "68:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, 
z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "udot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "udot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "udot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "udot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "udot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "udot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot 
z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "udot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "udot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "udot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "udot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "udot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "udot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z30.s, z6.b, z5.b[3]\n" + "udot 
z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "udot z31.s, z7.b, z5.b[3]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z24.s, z6.b, z4.b[0]\n" + "udot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "udot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z30.s, z6.b, z5.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "udot z31.s, z7.b, z5.b[0]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "udot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot 
z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "udot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z30.s, z6.b, z5.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "udot z31.s, z7.b, z5.b[1]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "udot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "udot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z30.s, z6.b, z5.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "udot z31.s, z7.b, z5.b[2]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "udot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + 
"udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "udot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z30.s, z6.b, z5.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "udot z31.s, z7.b, z5.b[3]\n" + "71:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 66b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, 
#1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "72:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 63b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp deleted file mode 
100644 index 43107e45fa..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int); - -class interleaved_bf16fp32_dot_3VLx8 { -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 2; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8; - - interleaved_bf16fp32_dot_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp deleted file mode 100644 index 7e20ed0971..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; - - K /= 2; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 2; + } + + // Use the standard fixed size transforms. 
+ StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL; + + cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp new file mode 100644 index 0000000000..adee900337 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 4; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8; - - interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp deleted file mode 100644 index 16cc69b2a6..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const bfloat16 *a_ptr = Apanel; - float *c_ptr = Cpanel; - - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 4; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL; + + cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp new file mode 100644 index 0000000000..e43404e608 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 4; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 1; - } - - // Use the standard fixed size transforms. 
- StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_fp16_mla_3VLx8; - - interleaved_fp16_mla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp deleted file mode 100644 index f2050cbd56..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 1; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_fp16_mla_8x3VL; + + cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp new file mode 100644 index 0000000000..46b8770409 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 1; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_fp32_mla_3VLx8; - - interleaved_fp32_mla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp deleted file mode 100644 index cd178c478a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 1; + } + + // Use the standard fixed size transforms. 
+ StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_fp32_mla_8x3VL; + + cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp new file mode 100644 index 0000000000..1e05a308b5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 2; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_fp32_mmla_3VLx8; - - interleaved_fp32_mmla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp deleted file mode 100644 index a404ae9c82..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - - -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - K /= 2; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 2; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_fp32_mmla_8x3VL; + + cls_sve_interleaved_fp32_mmla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp new file mode 100644 index 0000000000..39daf0ff20 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_sve.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); - -class interleaved_s8s32_dot_3VLx8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 4; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_s8s32_dot_3VLx8; - - interleaved_s8s32_dot_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp deleted file mode 100644 index cdc70705c5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class cls_sve_interleaved_s8s32_dot_8x3VL { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 4; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + + kern_type kernel=sve_interleaved_s8s32_dot_8x3VL; + + cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp new file mode 100644 index 0000000000..674c2400bf --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 4; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_sve.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); - -class interleaved_s8s32_mmla_3VLx8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 8; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8; - - interleaved_s8s32_mmla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp deleted file mode 100644 index cde9ec32e9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class cls_sve_interleaved_s8s32_mmla_8x3VL { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + + kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL; + + cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp new file mode 100644 index 0000000000..578aa01732 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_sve.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - -class interleaved_u8u32_dot_3VLx8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 4; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_u8u32_dot_3VLx8; - - interleaved_u8u32_dot_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp deleted file mode 100644 index 6626f8463b..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - K /= 4; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class cls_sve_interleaved_u8u32_dot_8x3VL { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 4; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + + kern_type kernel=sve_interleaved_u8u32_dot_8x3VL; + + cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp new file mode 100644 index 0000000000..891869c767 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 4; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb -#include "../std_transforms_sve.hpp" - -namespace arm_gemm { - -// Actual kernel implementations -void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - -class interleaved_u8u32_mmla_3VLx8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Kernel blocking parameters */ - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static unsigned int out_height() - { - return 8; - } - - static unsigned int k_unroll() - { - return 8; - } - - // Use the standard fixed size transforms. - StdTransformsSVE transforms = {}; - - kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8; - - interleaved_u8u32_mmla_3VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp deleted file mode 100644 index 81a1dbcf51..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include -#include "../../asmlib.hpp" - -namespace arm_gemm { - -void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - K /= 8; - const long loops_count = (K / 2) - 1; - const long tails_count = K % 2; - - for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class cls_sve_interleaved_u8u32_mmla_8x3VL { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; + + kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL; + + cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp new file mode 100644 index 0000000000..fa08a9d091 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb() * 1; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8; - - smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp deleted file mode 100644 index 5501688054..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp +++ /dev/null @@ -1,18807 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_smallK_hybrid_fp32_mla_1VLx8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { - const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; - const long ldab = lda * sizeof(float); - const long ldcb = ldc * sizeof(float); - const long odd_depth = (K % 4) ? (K % 4) : 4; - const long last_width = N - (loops_count * get_vector_length()); - float nullbias[64]; - if (!bias) { - memset(nullbias, 0, (1 * get_vector_length() * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - for (int y0=0; y0() * 1*sizeof(float) : 0; - const float *a_ptr0 = A + (y0 * lda); - - float *c_ptr0 = C + (y0 * ldc); - - switch(K) { - case 1: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 
.req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p6/z, [a_ptr1]\n" - "ld1rqw z2.s, p6/z, [a_ptr2]\n" - "ld1rqw z3.s, p6/z, [a_ptr3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4]\n" - "ld1rqw z5.s, p6/z, [a_ptr5]\n" - "ld1rqw z6.s, p6/z, [a_ptr6]\n" - "ld1rqw z7.s, p6/z, [a_ptr7]\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], 
%[biasinc]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "mov z26.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z27.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z29.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, 
c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "mov z26.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z27.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z29.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - 
"fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" 
(biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" 
- "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p6/z, [a_ptr1]\n" - "ld1rqw z2.s, p6/z, [a_ptr2]\n" - "ld1rqw z3.s, p6/z, [a_ptr3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4]\n" - "ld1rqw z5.s, p6/z, [a_ptr5]\n" - "ld1rqw z6.s, p6/z, [a_ptr6]\n" - "ld1rqw z7.s, p6/z, [a_ptr7]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmax z27.s, p7/m, z27.s, 
z22.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z26.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z27.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z28.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "mov z30.d, z24.d\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, 
p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "mov z26.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z27.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z29.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, 
z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), 
[odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, 
%[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p6/z, [a_ptr1]\n" - "ld1rqw z2.s, p6/z, [a_ptr2]\n" - "ld1rqw z3.s, p6/z, [a_ptr3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4]\n" - "ld1rqw z5.s, p6/z, [a_ptr5]\n" - "ld1rqw z6.s, p6/z, [a_ptr6]\n" - "ld1rqw z7.s, p6/z, [a_ptr7]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - 
"ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z26.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z27.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z28.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "mov z30.d, z24.d\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - 
"fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z26.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z27.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z28.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "mov z30.d, z24.d\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" 
- "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin 
z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], 
%[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p6/z, [a_ptr1]\n" - "ld1rqw z2.s, p6/z, [a_ptr2]\n" - "ld1rqw z3.s, p6/z, [a_ptr3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4]\n" - "ld1rqw z5.s, p6/z, [a_ptr5]\n" - "ld1rqw z6.s, p6/z, [a_ptr6]\n" - "ld1rqw z7.s, p6/z, [a_ptr7]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - 
"subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, 
[%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "mov z26.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z27.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z29.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, 
z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z25.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "st1w z28.s, p7, [c_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl c_ptr4, c_ptr4, #1\n" - "mov z26.d, z24.d\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z27.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "mov z28.d, z24.d\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr6, c_ptr6, #1\n" - "mov z30.d, z24.d\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - 
"fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "mov z25.d, z24.d\n" - "mov z26.d, z24.d\n" - "mov z27.d, z24.d\n" - "mov z28.d, z24.d\n" - "mov z29.d, z24.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, 
z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - 
"a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "addvl 
%[b_ptr0], %[b_ptr0], #5\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla 
z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z27.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w 
z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, 
p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "mov z27.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z29.s, 
z16.s, z5.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, 
[a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, 
z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - 
"add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, 
[a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, 
z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z27.d, z24.d\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, 
[c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - 
"fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z27.d, z24.d\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z26.s, z16.s, 
z2.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, 
z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla 
z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" 
(loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, 
%[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, 
z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL 
VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, 
z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, 
z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z27.d, z24.d\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl 
c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla 
z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, 
z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" 
- ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], 
#0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, 
z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs 
%[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov 
z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, 
z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - 
"mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, 
z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, 
z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla 
z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - 
); - break; - case 9: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw 
z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" 
- "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl 
c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla 
z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax 
z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, 
z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, 
#0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, 
z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, 
z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - 
case 10: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], 
#4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, 
z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - 
"fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w 
z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - 
"fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" 
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla 
z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, 
z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, 
z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, 
z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, 
[c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 11: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" 
- "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, 
[a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - 
"fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL 
VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - 
"fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - 
"fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" 
- "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla 
z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla 
z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - 
"ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, 
z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax 
z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 12: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req 
X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], 
%[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, 
[a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla 
z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, 
[c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, 
#0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, 
z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, 
p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" 
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, 
z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, 
z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw 
z1.s, p6/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl 
%[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 13: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - 
"add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, 
[a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, 
z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, 
z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - 
"ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - 
"fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, 
z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, 
#1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, 
z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla 
z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, 
z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, 
z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, 
z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 14: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" 
- "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" 
- "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" 
- "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, 
z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax 
z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], 
%[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla 
z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, 
z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, 
p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, 
#0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, 
z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, 
[%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], 
#3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - 
"fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, 
z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 15: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, 
%[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, 
[a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL 
VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla 
z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w 
z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, 
z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, 
z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - 
"fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - 
"ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, 
z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, 
[a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla 
z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, 
#0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - 
"ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla 
z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", 
"z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 16: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, 
[%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" 
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla 
z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" 
- "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - 
"mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, 
z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], 
#8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla 
z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - 
"fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - 
"ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" 
- "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, 
z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, 
z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, 
z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla 
z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", 
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 17: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt 
p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, 
[%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, 
z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, 
z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" 
- "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla 
z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - 
"fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla 
z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, 
z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - 
"fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla 
z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - 
"ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, 
z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, 
z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], 
#8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - 
"fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w 
z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 18: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], 
#0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla 
z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, 
z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, 
#0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, 
z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], 
%[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, 
z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, 
p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla 
z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - 
"ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, 
z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, 
#0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, 
z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - 
"ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w 
z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla 
z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], 
#0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w 
z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 19: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], 
#0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla 
z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, 
z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, 
[a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla 
z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, 
[a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" 
- "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw 
z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla 
z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - 
"ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla 
z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, 
z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], 
#2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - 
"ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, 
z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw 
z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, 
#0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, 
z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", 
"z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 20: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, 
%[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, 
[%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, 
z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, 
z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" 
- "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - 
"addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, 
z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, 
z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], 
#3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], 
#8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, 
z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, 
z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, 
z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, 
z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, 
z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - 
"fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - 
"fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq 
c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 21: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, 
%[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla 
z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, 
#0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, 
z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" 
- "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - 
"fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla 
z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, 
z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla 
z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, 
z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, 
z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, 
z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - 
"fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, 
z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, 
z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla 
z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, 
z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, 
[a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, 
z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 22: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - 
"c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], 
%[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - 
"ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, 
z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, 
z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - 
"ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, 
z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, 
p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, 
z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - 
"fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" 
- "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla 
z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, 
#0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, 
z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, 
z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, 
z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla 
z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, 
z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla 
z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, 
z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", 
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 23: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL 
VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, 
z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], 
#7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, 
z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - 
"fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], 
#5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, 
z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, 
z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - 
"fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, 
z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla 
z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - 
"mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, 
z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, 
MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, 
z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - 
"fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, 
z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - 
"fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, 
z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla 
z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, 
z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - default: - case 24: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - 
"c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" - "ptrue p7.s\n" - "whilelt p6.s, %[temp], %[odd_depth]\n" - "whilelt p0.s, %[temp], %[last_width]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "add %[biasptr], 
%[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z25.d, z24.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z26.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z27.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z28.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z29.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z30.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z31.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, 
[a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, 
z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, 
z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - 
"ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.eq 3f\n" - "4:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z26.s, 
p7/m, z26.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p7/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z27.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, 
z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, 
z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - 
"ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, 
z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" 
- "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b.ne 4b\n" - "3:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "st1w z25.s, p7, [c_ptr1]\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "st1w z26.s, p7, [c_ptr2]\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z25.d, z24.d\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z26.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z27.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.d, z24.d\n" - "addvl 
c_ptr1, c_ptr1, #1\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.d, z24.d\n" - "addvl c_ptr2, c_ptr2, #1\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.d, z24.d\n" - "addvl c_ptr3, c_ptr3, #1\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.d, z24.d\n" - "addvl c_ptr4, c_ptr4, #1\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, 
z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, 
z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, 
z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "b 5f\n" - "2:\n" - "ld1w z24.s, p0/z, [%[biasptr]]\n" - "add %[biasptr], %[biasptr], %[biasinc]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z25.d, z24.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z26.d, z24.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z27.d, z24.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "mov z28.d, z24.d\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "mov z29.d, z24.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "mov z30.d, z24.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z31.d, z24.d\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla 
z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, 
z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "ld1w z16.s, p7/z, [%[b_ptr0]]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" - "fmla z25.s, z19.s, 
z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6, 
#0x40]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z24.s, z16.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" - "fmla z25.s, z16.s, z1.s[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z26.s, z16.s, z2.s[0]\n" - "fmla z27.s, z16.s, z3.s[0]\n" - "fmla z28.s, z16.s, z4.s[0]\n" - "fmla z29.s, z16.s, z5.s[0]\n" - "fmla z30.s, z16.s, z6.s[0]\n" - "fmla z31.s, z16.s, z7.s[0]\n" - "fmla z24.s, z17.s, z0.s[1]\n" - "fmla z25.s, z17.s, z1.s[1]\n" - "fmla z26.s, z17.s, z2.s[1]\n" - "fmla z27.s, z17.s, z3.s[1]\n" - "fmla z28.s, z17.s, z4.s[1]\n" - "fmla z29.s, z17.s, z5.s[1]\n" - "fmla z30.s, z17.s, z6.s[1]\n" - "fmla z31.s, z17.s, z7.s[1]\n" - "fmla z24.s, z18.s, z0.s[2]\n" - "fmla z25.s, z18.s, z1.s[2]\n" - "fmla z26.s, z18.s, z2.s[2]\n" - "fmla z27.s, z18.s, z3.s[2]\n" - "fmla z28.s, z18.s, z4.s[2]\n" - "fmla z29.s, z18.s, z5.s[2]\n" - "fmla z30.s, z18.s, z6.s[2]\n" - "fmla z31.s, z18.s, z7.s[2]\n" - "fmla z24.s, z19.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" - "fmla z25.s, z19.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" - "fmla z26.s, z19.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" - "fmla z27.s, z19.s, z3.s[3]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" - "fmla z28.s, z19.s, z4.s[3]\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" - "fmla z29.s, z19.s, z5.s[3]\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" - "fmla z30.s, z19.s, z6.s[3]\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" - "fmla z31.s, z19.s, z7.s[3]\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" - "fmla z24.s, z20.s, z0.s[0]\n" - "fmla z25.s, z20.s, z1.s[0]\n" - "fmla z26.s, z20.s, z2.s[0]\n" - "fmla z27.s, z20.s, z3.s[0]\n" - "fmla z28.s, z20.s, z4.s[0]\n" - "fmla z29.s, z20.s, z5.s[0]\n" - "fmla z30.s, z20.s, z6.s[0]\n" - "fmla z31.s, z20.s, z7.s[0]\n" - "fmla z24.s, z21.s, z0.s[1]\n" - "fmla z25.s, z21.s, z1.s[1]\n" - "fmla z26.s, z21.s, z2.s[1]\n" - "fmla z27.s, z21.s, z3.s[1]\n" - "fmla z28.s, z21.s, z4.s[1]\n" - "fmla z29.s, 
z21.s, z5.s[1]\n" - "fmla z30.s, z21.s, z6.s[1]\n" - "fmla z31.s, z21.s, z7.s[1]\n" - "fmla z24.s, z22.s, z0.s[2]\n" - "fmla z25.s, z22.s, z1.s[2]\n" - "fmla z26.s, z22.s, z2.s[2]\n" - "fmla z27.s, z22.s, z3.s[2]\n" - "fmla z28.s, z22.s, z4.s[2]\n" - "fmla z29.s, z22.s, z5.s[2]\n" - "fmla z30.s, z22.s, z6.s[2]\n" - "fmla z31.s, z22.s, z7.s[2]\n" - "fmla z24.s, z23.s, z0.s[3]\n" - "fmla z25.s, z23.s, z1.s[3]\n" - "fmla z26.s, z23.s, z2.s[3]\n" - "fmla z27.s, z23.s, z3.s[3]\n" - "fmla z28.s, z23.s, z4.s[3]\n" - "fmla z29.s, z23.s, z5.s[3]\n" - "fmla z30.s, z23.s, z6.s[3]\n" - "fmla z31.s, z23.s, z7.s[3]\n" - "5:\n" - "ld1rw z22.s, p7/z, [%[minptr]]\n" - "ld1rw z23.s, p7/z, [%[maxptr]]\n" - "fmax z24.s, p7/m, z24.s, z22.s\n" - "fmax z25.s, p7/m, z25.s, z22.s\n" - "fmax z26.s, p7/m, z26.s, z22.s\n" - "fmax z27.s, p7/m, z27.s, z22.s\n" - "fmin z24.s, p7/m, z24.s, z23.s\n" - "fmin z25.s, p7/m, z25.s, z23.s\n" - "fmin z26.s, p7/m, z26.s, z23.s\n" - "fmin z27.s, p7/m, z27.s, z23.s\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "fmax z28.s, p7/m, z28.s, z22.s\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "fmax z29.s, p7/m, z29.s, z22.s\n" - "st1w z25.s, p0, [c_ptr1]\n" - "fmax z30.s, p7/m, z30.s, z22.s\n" - "fmin z28.s, p7/m, z28.s, z23.s\n" - "fmax z31.s, p7/m, z31.s, z22.s\n" - "st1w z26.s, p0, [c_ptr2]\n" - "fmin z29.s, p7/m, z29.s, z23.s\n" - "fmin z30.s, p7/m, z30.s, z23.s\n" - "fmin z31.s, p7/m, z31.s, z23.s\n" - "st1w z27.s, p0, [c_ptr3]\n" - "st1w z28.s, p0, [c_ptr4]\n" - "st1w z29.s, p0, [c_ptr5]\n" - "st1w z30.s, p0, [c_ptr6]\n" - "st1w z31.s, p0, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" 
(temp), [biasptr] "+r" (biasptr) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp new file mode 100644 index 0000000000..2097d76a54 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + +class cls_sve_smallK_hybrid_fp32_mla_8x1VL +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return get_vector_length() * 1; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return true; + } + + static constexpr bool supports_activation() + { + return true; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL; + + cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp new file mode 100644 index 0000000000..e07cfa8218 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp @@ -0,0 +1,18807 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include "arm_gemm.hpp" + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { + const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + const long odd_depth = (K % 4) ? 
(K % 4) : 4; + const long last_width = N - (loops_count * get_vector_length()); + float nullbias[64]; + if (!bias) { + memset(nullbias, 0, (1 * get_vector_length() * sizeof(float))); + } + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + for (int y0=0; y0() * 1*sizeof(float) : 0; + const float *a_ptr0 = A + (y0 * lda); + + float *c_ptr0 = C + (y0 * ldc); + + switch(K) { + case 1: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], 
#0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4]\n" + "ld1rqw z5.s, p6/z, [a_ptr5]\n" + "ld1rqw z6.s, p6/z, [a_ptr6]\n" + "ld1rqw z7.s, p6/z, [a_ptr7]\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, 
[%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z26.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z27.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, 
[c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z26.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z27.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, 
[%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add 
a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4]\n" + "ld1rqw z5.s, p6/z, [a_ptr5]\n" + "ld1rqw z6.s, p6/z, [a_ptr6]\n" + "ld1rqw z7.s, p6/z, [a_ptr7]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov 
z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z26.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z27.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z28.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "mov z30.d, 
z24.d\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z26.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z27.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov 
z29.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" 
+ "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, 
%[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4]\n" + "ld1rqw z5.s, p6/z, [a_ptr5]\n" + "ld1rqw z6.s, p6/z, [a_ptr6]\n" + "ld1rqw z7.s, p6/z, [a_ptr7]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov 
z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "st1w z28.s, 
p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z26.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z27.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z28.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "mov z30.d, z24.d\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w 
z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z26.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z27.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z28.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "mov z30.d, z24.d\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, 
z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, 
%[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4]\n" + "ld1rqw z5.s, p6/z, [a_ptr5]\n" + "ld1rqw z6.s, p6/z, [a_ptr6]\n" + "ld1rqw z7.s, p6/z, [a_ptr7]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + 
"fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z26.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z27.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], 
%[biasptr], %[biasinc]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmax z30.s, p7/m, 
z30.s, z22.s\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z25.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "st1w z28.s, p7, [c_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z26.d, z24.d\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z27.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z28.d, z24.d\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr6, c_ptr6, #1\n" + "mov z30.d, z24.d\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "mov z25.d, z24.d\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + 
"fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w 
z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], 
#0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, 
z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin 
z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z27.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, 
z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1rqw z5.s, 
p7/z, [a_ptr5]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z27.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, 
#0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, 
z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq 
c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + /* case 6: SVE inline-asm kernel producing eight output rows from six B vectors (z16-z21) per column block. Rows 1-7 of A and C are addressed through the x0-x13 aliases below; z24-z31 hold one accumulator per row, seeded from the bias vector, clamped to [*minptr, *maxptr] and stored to C. loops is the number of column blocks stored under the all-true predicate p7; one further, final block is stored under p0. The A pointers are never advanced, so every column block reuses the same A data. */ case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + /* Row k pointer = row k-1 pointer + lda (A) or ldc (C). */ "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + /* For each unit of oob_rows, point one more trailing row (row 7 first, down to row 1) at row 0's A and C. */ "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq
1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + /* p7 = all lanes; p6 = whilelt temp < odd_depth, guarding the second 16-byte A load; p0 = whilelt temp < last_width, guarding the final column block. */ "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + /* loops == 0: only the final block exists, handled from label 2. */ "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla
z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + /* Flags still come from the subs on loops above. */ "b.eq 3f\n" + "4:\n" + /* Steady-state loop: clamp and store the previous block under p7 while loading B, A and bias for the next block and accumulating it. */ "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw
z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z27.d, z24.d\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s,
z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + /* Last full-width block: clamp and store it under p7, then accumulate the final block, whose bias is loaded under p0. */ "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z,
[a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z27.d, z24.d\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" +
"fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + /* Entry for loops == 0: accumulate the only block, with its bias loaded under p0. */ "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla
z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "5:\n" + /* Final block: clamp all eight accumulators and store them under p0. */ "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s,
p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + /* p0, p6 and p7 are written by the ptrue/whilelt instructions above, so they are listed as clobbers alongside the x and z registers. */ : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, 
z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, 
z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, 
p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, 
[a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin 
z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z27.d, z24.d\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, 
z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, 
z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax 
z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req 
X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], 
%[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, 
z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, 
z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, 
z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax 
z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, 
z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla 
z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, 
z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, 
[c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 9: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], 
%[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, 
z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" 
+ "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, 
p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" 
+ "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, 
z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, 
z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, 
z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" 
+ "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w 
z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 10: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" 
+ "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, 
z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla 
z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w 
z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, 
z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, 
z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, 
z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla 
z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla 
z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw 
z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla 
z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", 
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt 
p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, 
[%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + 
"fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, 
[c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + 
"fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + 
"fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, 
z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, 
#0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + 
"fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla 
z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, 
[a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + 
".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], 
#0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, 
z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla 
z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax 
z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], 
%[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + 
"fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" 
+ "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla 
z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, 
z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, 
[%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, 
[a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla 
z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" 
(ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add 
a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla 
z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla 
z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" 
+ "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" 
+ "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, 
z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], 
#0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, 
z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla 
z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla 
z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, 
[a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, 
z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq 
c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 14: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], 
#0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, 
z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, 
z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw 
z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, 
p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, 
[a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, 
z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + 
"fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, 
#1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" 
+ "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + 
"ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], 
#1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, 
z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, 
z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + 
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, 
%[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w 
z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla 
z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, 
z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, 
[a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, 
z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, 
z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw 
z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, 
c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, 
z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + 
"fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla 
z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, 
z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, 
[a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin 
z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 16: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 
1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla 
z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, 
z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, 
[%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, 
z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla 
z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, 
z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" 
+ "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla 
z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw 
z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" 
+ "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla 
z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, 
z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + 
"st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 17: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, 
a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + 
"ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, 
z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, 
z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla 
z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, 
z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, 
[a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, 
z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, 
[%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, 
p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, 
#0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w 
z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, 
z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, 
z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, 
z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla 
z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla 
z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + 
); + break; + case 18: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, 
p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, 
[a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, 
z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, 
z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" 
+ "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + 
"fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla 
z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + 
"ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, 
z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl 
c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" 
+ "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, 
z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, 
[a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla 
z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + 
"ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, 
#0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, 
z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + 
case 19: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], 
#4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + 
"fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl 
%[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, 
z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + 
"ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, 
c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, 
z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, 
z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + 
"fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, 
z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla 
z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, 
[%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla 
z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, 
p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, 
z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" 
+ "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + 
"fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, 
[c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 20: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" 
+ "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, 
[a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla 
z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, 
z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, 
z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + 
"fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, 
p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], 
#0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, 
z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, 
z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin 
z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, 
p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, 
z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla 
z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, 
z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw 
z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" 
+ "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" 
+ "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, 
z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 21: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req 
X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, 
[%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, 
z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, 
p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" 
+ "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" 
+ "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, 
p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, 
[a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla 
z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, 
z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, 
z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" 
+ "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw 
z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" 
+ "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, 
z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, 
z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], 
#0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + 
"ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + 
"fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + 
"fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + 
".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 22: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + 
"b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla 
z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, 
z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, 
#0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, 
z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" 
+ "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, 
z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, 
z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + 
"fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, 
p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + 
"fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, 
#1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, 
z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" 
+ "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + 
"ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, 
z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, 
[%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, 
z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, 
z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + 
"fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin 
z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 23: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], 
#0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, 
z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + 
"fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, 
p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, 
z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + 
"fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + 
"ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw 
z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" 
+ "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + 
"fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" 
+ "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, 
z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla 
z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, 
[a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" 
+ "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + 
"fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, 
[a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + 
"fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, 
z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw 
z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, 
z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + 
".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + default: + case 24: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, 
%[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.s\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.d, z24.d\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z26.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z27.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z28.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z29.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z30.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z31.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla 
z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + 
"fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, 
z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + 
"fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, 
z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.eq 3f\n" + "4:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p7/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z27.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w 
z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + 
"fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + 
"fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla 
z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, 
[a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" 
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "st1w z25.s, p7, [c_ptr1]\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "st1w z26.s, p7, [c_ptr2]\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z25.d, z24.d\n" + "st1w z27.s, p7, [c_ptr3]\n" + "mov z26.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z27.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.d, z24.d\n" + "addvl c_ptr1, c_ptr1, #1\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.d, z24.d\n" + "addvl c_ptr2, c_ptr2, #1\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.d, z24.d\n" + "addvl c_ptr3, c_ptr3, #1\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.d, z24.d\n" + "addvl c_ptr4, c_ptr4, #1\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, 
p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], 
#0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, 
z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + 
"fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, 
z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "b 5f\n" + "2:\n" + "ld1w z24.s, p0/z, [%[biasptr]]\n" + "add %[biasptr], %[biasptr], %[biasinc]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z25.d, z24.d\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z26.d, z24.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z27.d, z24.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "mov z28.d, z24.d\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "mov z29.d, z24.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "mov z30.d, z24.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z31.d, z24.d\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, 
#0x10]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n" + "fmla z25.s, z16.s, z1.s[0]\n" 
+ "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "ld1w z16.s, p7/z, [%[b_ptr0]]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, 
z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" + "fmla z24.s, z17.s, z0.s[1]\n" + "fmla z25.s, z17.s, z1.s[1]\n" + "fmla z26.s, z17.s, z2.s[1]\n" + "fmla z27.s, z17.s, z3.s[1]\n" + "fmla z28.s, z17.s, z4.s[1]\n" + "fmla z29.s, z17.s, z5.s[1]\n" + "fmla z30.s, z17.s, z6.s[1]\n" + "fmla z31.s, z17.s, z7.s[1]\n" + "fmla z24.s, z18.s, z0.s[2]\n" + "fmla z25.s, z18.s, z1.s[2]\n" + "fmla z26.s, z18.s, z2.s[2]\n" + "fmla z27.s, z18.s, z3.s[2]\n" + "fmla z28.s, z18.s, z4.s[2]\n" + "fmla z29.s, z18.s, z5.s[2]\n" + "fmla z30.s, z18.s, z6.s[2]\n" + "fmla z31.s, z18.s, z7.s[2]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, 
p6/z, [%[a_ptr0], #0x50]\n" + "fmla z25.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z26.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z27.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z19.s, z4.s[3]\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n" + "fmla z29.s, z19.s, z5.s[3]\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n" + "fmla z30.s, z19.s, z6.s[3]\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n" + "fmla z31.s, z19.s, z7.s[3]\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n" + "fmla z24.s, z20.s, z0.s[0]\n" + "fmla z25.s, z20.s, z1.s[0]\n" + "fmla z26.s, z20.s, z2.s[0]\n" + "fmla z27.s, z20.s, z3.s[0]\n" + "fmla z28.s, z20.s, z4.s[0]\n" + "fmla z29.s, z20.s, z5.s[0]\n" + "fmla z30.s, z20.s, z6.s[0]\n" + "fmla z31.s, z20.s, z7.s[0]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "fmla z25.s, z21.s, z1.s[1]\n" + "fmla z26.s, z21.s, z2.s[1]\n" + "fmla z27.s, z21.s, z3.s[1]\n" + "fmla z28.s, z21.s, z4.s[1]\n" + "fmla z29.s, z21.s, z5.s[1]\n" + "fmla z30.s, z21.s, z6.s[1]\n" + "fmla z31.s, z21.s, z7.s[1]\n" + "fmla z24.s, z22.s, z0.s[2]\n" + "fmla z25.s, z22.s, z1.s[2]\n" + "fmla z26.s, z22.s, z2.s[2]\n" + "fmla z27.s, z22.s, z3.s[2]\n" + "fmla z28.s, z22.s, z4.s[2]\n" + "fmla z29.s, z22.s, z5.s[2]\n" + "fmla z30.s, z22.s, z6.s[2]\n" + "fmla z31.s, z22.s, z7.s[2]\n" + "fmla z24.s, z23.s, z0.s[3]\n" + "fmla z25.s, z23.s, z1.s[3]\n" + "fmla z26.s, z23.s, z2.s[3]\n" + "fmla z27.s, z23.s, z3.s[3]\n" + "fmla z28.s, z23.s, z4.s[3]\n" + "fmla z29.s, z23.s, z5.s[3]\n" + "fmla z30.s, z23.s, z6.s[3]\n" + "fmla z31.s, z23.s, z7.s[3]\n" + "5:\n" + "ld1rw z22.s, p7/z, [%[minptr]]\n" + "ld1rw z23.s, p7/z, [%[maxptr]]\n" + "fmax z24.s, p7/m, z24.s, z22.s\n" + "fmax z25.s, p7/m, z25.s, z22.s\n" + "fmax z26.s, p7/m, z26.s, z22.s\n" + "fmax z27.s, p7/m, z27.s, z22.s\n" + "fmin z24.s, p7/m, z24.s, z23.s\n" + "fmin z25.s, p7/m, z25.s, z23.s\n" + "fmin z26.s, p7/m, z26.s, z23.s\n" + "fmin z27.s, p7/m, z27.s, z23.s\n" + "st1w z24.s, p0, 
[%[c_ptr0]]\n" + "fmax z28.s, p7/m, z28.s, z22.s\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "fmax z29.s, p7/m, z29.s, z22.s\n" + "st1w z25.s, p0, [c_ptr1]\n" + "fmax z30.s, p7/m, z30.s, z22.s\n" + "fmin z28.s, p7/m, z28.s, z23.s\n" + "fmax z31.s, p7/m, z31.s, z22.s\n" + "st1w z26.s, p0, [c_ptr2]\n" + "fmin z29.s, p7/m, z29.s, z23.s\n" + "fmin z30.s, p7/m, z30.s, z23.s\n" + "fmin z31.s, p7/m, z31.s, z23.s\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp deleted file mode 100644 index eef1e4cc65..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - -class smallK_hybrid_s8s32_dot_1VLx8 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return get_vector_length() * 1; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8; - - smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp deleted file mode 100644 index e2fbdcb61b..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp +++ /dev/null @@ -1,7503 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; - const long ldab = lda * sizeof(int8_t); - const long ldcb = ldc * sizeof(int32_t); - const long odd_depth = (K % 16) ? 
(K % 16) : 16; - const long last_width = N - (loops_count * get_vector_length()); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + +class cls_sve_smallK_hybrid_s8s32_dot_8x1VL +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return get_vector_length() * 1; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return false; + } + + static constexpr bool supports_activation() + { + return false; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL; + + cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp new file mode 100644 index 0000000000..5770076d04 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp @@ -0,0 +1,8971 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool) { + const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; + const long ldab = lda * sizeof(int8_t); + const long ldcb = ldc * sizeof(int32_t); + const long odd_depth = (K % 16) ? 
(K % 16) : 16; + const long last_width = N - (loops_count * get_vector_length()); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0 - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - -class smallK_hybrid_u8u32_dot_1VLx8 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return get_vector_length() * 1; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return false; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() - { - return false; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8; - - smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp deleted file mode 100644 index 1d0b84e788..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp +++ /dev/null @@ -1,7503 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include - -#include "arm_gemm.hpp" - -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { - const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; - const long ldab = lda * sizeof(uint8_t); - const long ldcb = ldc * sizeof(uint32_t); - const long odd_depth = (K % 16) ? 
(K % 16) : 16; - const long last_width = N - (loops_count * get_vector_length()); - const long odds_count = K % 4; - K = (K + 3) / 4; - - for (int y0=0; y0 + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + +class cls_sve_smallK_hybrid_u8u32_dot_8x1VL +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return get_vector_length() * 1; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return false; + } + + static constexpr bool supports_activation() + { + return false; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL; + + cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp new file mode 100644 index 0000000000..b980d9b5c2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp @@ -0,0 +1,8971 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include "arm_gemm.hpp" + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool) { + const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; + const long ldab = lda * sizeof(uint8_t); + const long ldcb = ldc * sizeof(uint32_t); + const long odd_depth = (K % 16) ? 
(K % 16) : 16; + const long last_width = N - (loops_count * get_vector_length()); + const long odds_count = K % 4; + K = (K + 3) / 4; + + for (int y0=0; y0(newargs); if (_subgemm == nullptr) { diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp index cac02cf28e..111d01ed3a 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.cpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp @@ -301,6 +301,179 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne out_ptr1 += 16; } + // We are often quantizing one block of interleaved kernel output at a time - these are three registers + // wide. Special case that here. + if (regs==3) { + regs -= 3; + + int32x4_t v_mul0; + int32x4_t v_mul1; + int32x4_t v_mul2; + + int32x4_t v_shf0; + int32x4_t v_shf1; + int32x4_t v_shf2; + + int32x4_t v_shf0l; + int32x4_t v_shf1l; + int32x4_t v_shf2l; + + if (per_channel) { + v_mul0 = vld1q_s32(perch_mul_ptr); + v_mul1 = vld1q_s32(perch_mul_ptr + 4); + v_mul2 = vld1q_s32(perch_mul_ptr + 8); + perch_mul_ptr += 12; + + v_shf0 = vld1q_s32(perch_shift_ptr); + v_shf1 = vld1q_s32(perch_shift_ptr + 4); + v_shf2 = vld1q_s32(perch_shift_ptr + 8); + perch_shift_ptr += 12; + + if (do_left_shift) { + v_shf0l = vld1q_s32(perch_shiftl_ptr); + v_shf1l = vld1q_s32(perch_shiftl_ptr + 4); + v_shf2l = vld1q_s32(perch_shiftl_ptr + 8); + perch_shiftl_ptr += 12; + } + } else { + v_mul0=v_mul1=v_mul2=v_mul; + v_shf0=v_shf1=v_shf2=v_right_shift; + v_shf0l=v_shf1l=v_shf2l=v_left_shift; + } + + // Load column pointers + int32x4_t v_col0 = vld1q_s32(colptr); + int32x4_t v_col1 = vld1q_s32(colptr + 4); + int32x4_t v_col2 = vld1q_s32(colptr + 8); + colptr += 12; + + // Load input data (row 0); + int32x4_t v_in00 = vld1q_s32(in_ptr); + int32x4_t v_in01 = vld1q_s32(in_ptr + 4); + int32x4_t v_in02 = vld1q_s32(in_ptr + 8); + in_ptr += 12; + + // Load input data (row 1); + int32x4_t v_in10 = vld1q_s32(in_ptr1); + int32x4_t v_in11 = 
vld1q_s32(in_ptr1 + 4); + int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8); + in_ptr1 += 12; + + // Add on row bias and column bias + v_in00 = vaddq_s32(v_in00, v_row_sum); + v_in01 = vaddq_s32(v_in01, v_row_sum); + v_in02 = vaddq_s32(v_in02, v_row_sum); + + v_in10 = vaddq_s32(v_in10, v_row_sum1); + v_in11 = vaddq_s32(v_in11, v_row_sum1); + v_in12 = vaddq_s32(v_in12, v_row_sum1); + + v_in00 = vaddq_s32(v_in00, v_col0); + v_in01 = vaddq_s32(v_in01, v_col1); + v_in02 = vaddq_s32(v_in02, v_col2); + + v_in10 = vaddq_s32(v_in10, v_col0); + v_in11 = vaddq_s32(v_in11, v_col1); + v_in12 = vaddq_s32(v_in12, v_col2); + + // Quantize + + // If a left shift is needed it needs to happen first. + if (do_left_shift) { + v_in00 = vrshlq_s32(v_in00, v_shf0l); + v_in01 = vrshlq_s32(v_in01, v_shf1l); + v_in02 = vrshlq_s32(v_in02, v_shf2l); + + v_in10 = vrshlq_s32(v_in10, v_shf0l); + v_in11 = vrshlq_s32(v_in11, v_shf1l); + v_in12 = vrshlq_s32(v_in12, v_shf2l); + } + + // Multiply + v_in00 = vqrdmulhq_s32(v_in00, v_mul0); + v_in01 = vqrdmulhq_s32(v_in01, v_mul1); + v_in02 = vqrdmulhq_s32(v_in02, v_mul2); + + v_in10 = vqrdmulhq_s32(v_in10, v_mul0); + v_in11 = vqrdmulhq_s32(v_in11, v_mul1); + v_in12 = vqrdmulhq_s32(v_in12, v_mul2); + + // Compute and add on corrective offset + if (do_shift_correction) { + int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0); + int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1); + int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2); + + int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0); + int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1); + int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2); + + v_temp00 = vshrq_n_s32(v_temp00, 31); + v_temp01 = vshrq_n_s32(v_temp01, 31); + v_temp02 = vshrq_n_s32(v_temp02, 31); + + v_temp10 = vshrq_n_s32(v_temp10, 31); + v_temp11 = vshrq_n_s32(v_temp11, 31); + v_temp12 = vshrq_n_s32(v_temp12, 31); + + v_in00 = vqaddq_s32(v_in00, v_temp00); + v_in01 = vqaddq_s32(v_in01, v_temp01); + v_in02 = vqaddq_s32(v_in02, v_temp02); + + v_in10 = vqaddq_s32(v_in10, 
v_temp10); + v_in11 = vqaddq_s32(v_in11, v_temp11); + v_in12 = vqaddq_s32(v_in12, v_temp12); + } + + v_in00 = vrshlq_s32(v_in00, v_shf0); + v_in01 = vrshlq_s32(v_in01, v_shf1); + v_in02 = vrshlq_s32(v_in02, v_shf2); + + v_in10 = vrshlq_s32(v_in10, v_shf0); + v_in11 = vrshlq_s32(v_in11, v_shf1); + v_in12 = vrshlq_s32(v_in12, v_shf2); + + v_in00 = vaddq_s32(v_in00, v_c_offset); + v_in01 = vaddq_s32(v_in01, v_c_offset); + v_in02 = vaddq_s32(v_in02, v_c_offset); + + v_in10 = vaddq_s32(v_in10, v_c_offset); + v_in11 = vaddq_s32(v_in11, v_c_offset); + v_in12 = vaddq_s32(v_in12, v_c_offset); + + v_in00 = vmaxq_s32(v_in00, v_minval); + v_in01 = vmaxq_s32(v_in01, v_minval); + v_in02 = vmaxq_s32(v_in02, v_minval); + + v_in10 = vmaxq_s32(v_in10, v_minval); + v_in11 = vmaxq_s32(v_in11, v_minval); + v_in12 = vmaxq_s32(v_in12, v_minval); + + v_in00 = vminq_s32(v_in00, v_maxval); + v_in01 = vminq_s32(v_in01, v_maxval); + v_in02 = vminq_s32(v_in02, v_maxval); + + v_in10 = vminq_s32(v_in10, v_maxval); + v_in11 = vminq_s32(v_in11, v_maxval); + v_in12 = vminq_s32(v_in12, v_maxval); + + int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01)); + int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in02)); + + int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11)); + int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in12)); + + int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01)); + int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11)); + + vst1q_lane_s64(reinterpret_cast(out_ptr), vreinterpretq_s64_s8(v_uz0), 0); + vst1q_lane_s32(reinterpret_cast(out_ptr + 8), vreinterpretq_s32_s8(v_uz0), 2); + out_ptr += 12; + vst1q_lane_s64(reinterpret_cast(out_ptr1), vreinterpretq_s64_s8(v_uz1), 0); + vst1q_lane_s32(reinterpret_cast(out_ptr1 + 8), vreinterpretq_s32_s8(v_uz1), 2); + out_ptr1 += 
12; + } + while (regs--) { int32x4_t v_mul0; int32x4_t v_shf0; diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp index b0e0c3b580..3f3443025c 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp @@ -23,6 +23,8 @@ */ #pragma once +#include "utils.hpp" // IndirectInputArg + namespace arm_gemm { template @@ -39,4 +41,8 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col); +template +void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, int32_t *output_ptr, const Requantize32 *qp); + } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp new file mode 100644 index 0000000000..5433676558 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "quantized.hpp" +#include "utils.hpp" + +#include + +namespace arm_gemm { + +template<> +void row_sums_indirect( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, int32_t *out_ptr, const Requantize32 *qp +) +{ + struct KernelArgs { + unsigned int num_strings; + const unsigned int *string_lengths; + unsigned int input_initial_col; + } ka; + + unsigned long flags=0; + void *input_ptr; + size_t input_offset; + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + input_offset=A_arg.direct.stride; + } + + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + + __asm__ __volatile__( + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "neg v2.4s, v2.4s\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 86f\n" + "cmp %x[M], #0x4\n" + "bgt 69f\n" + "beq 52f\n" + "cmp %x[M], #0x2\n" + "bgt 35f\n" + "beq 18f\n" + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "movi v0.4s, #0x0\n" + "mov x9, #0x0\n" + "mov x28, #0x0\n" + "2:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 3f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add 
x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "cbnz x28, 4f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 4f\n" + "3:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "4:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 8f\n" + "cmp x27, #0x20\n" + "blt 7f\n" + "5:" // Height 1: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "blt 6f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "mov x9, #0x0\n" + "6:" // Height 1: Multiply loop: unique 1: no collapse + "sadalp v1.8h, v31.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 5b\n" + "7:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "8:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 17f\n" + "tbz x27, #3, 12f\n" + "ldr d31, [x26], #0x8\n" + "tbz x27, #2, 10f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "tbz x27, #1, 9f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[14], [x26]\n" + "b 16f\n" + "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[12], [x26]\n" + "b 16f\n" + "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 11f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[10], [x26]\n" + "b 16f\n" + "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[8], [x26]\n" + "b 16f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 14f\n" + "ldr s31, [x26], #0x4\n" + "tbz x27, #1, 13f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[6], [x26]\n" + "b 16f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 16f\n" + "ld1 { 
v31.b }[4], [x26]\n" + "b 16f\n" + "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 15f\n" + "ldr h31, [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[2], [x26]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 2b\n" + "sadalp v0.4s, v1.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str s0, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "18:" // Height 2 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "cbnz x28, 21f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "21:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 25f\n" + "cmp x27, #0x20\n" + "blt 24f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "blt 23f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "mov x9, #0x0\n" + "23:" // Height 2: Multiply loop: unique 2: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "add x9, x9, #0x1\n" + "sub 
x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 22b\n" + "24:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "25:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 34f\n" + "tbz x27, #3, 29f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "tbz x27, #2, 27f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "tbz x27, #1, 26f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "b 33f\n" + "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "b 33f\n" + "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 28f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "b 33f\n" + "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "b 33f\n" + "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 31f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "tbz x27, #1, 30f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "b 33f\n" + "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "b 33f\n" + "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 32f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "tbz x27, #0, 33f\n" + 
"ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "b 33f\n" + "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "33:" // Height 2: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "34:" // Height 2: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 19b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "b 104f\n" + "35:" // Height 3 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "36:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "cbnz x28, 38f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "38:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 42f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 3: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "blt 40f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi 
v27.8h, #0x0\n" + "mov x9, #0x0\n" + "40:" // Height 3: Multiply loop: unique 3: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "add x24, x24, #0x10\n" + "42:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 51f\n" + "tbz x27, #3, 46f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "tbz x27, #2, 44f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "tbz x27, #1, 43f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "b 50f\n" + "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "b 50f\n" + "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 45f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "b 50f\n" + "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "b 50f\n" + "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 48f\n" + "ldr s31, [x26], #0x4\n" + 
"ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "tbz x27, #1, 47f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "b 50f\n" + "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "b 50f\n" + "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 49f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "b 50f\n" + "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "50:" // Height 3: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "51:" // Height 3: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 36b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v26.4s, v26.4s, v2.4s\n" + "str s26, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "52:" // Height 4 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "53:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + 
"tbz %x[flags], #3, 54f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "cbnz x28, 55f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 55f\n" + "54:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "55:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 59f\n" + "cmp x27, #0x20\n" + "blt 58f\n" + "56:" // Height 4: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "blt 57f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "mov x9, #0x0\n" + "57:" // Height 4: Multiply loop: unique 4: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 56b\n" + "58:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x26, x26, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "59:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 68f\n" + "tbz x27, #3, 63f\n" + "ldr d31, 
[x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x27, #2, 61f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz x27, #1, 60f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "b 67f\n" + "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "b 67f\n" + "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 62f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "b 67f\n" + "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "b 67f\n" + "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 65f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz x27, #1, 64f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "b 67f\n" + "64:" // Height 4: Multiply loop: 
Ragged operand read: partial_1_4 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "b 67f\n" + "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 66f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "b 67f\n" + "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "67:" // Height 4: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "68:" // Height 4: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 53b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "addp v0.4s, v0.4s, v29.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "b 104f\n" + "69:" // Height 5 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "70:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 71f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, 
[x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "cbnz x28, 72f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 72f\n" + "71:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "72:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 76f\n" + "cmp x27, #0x20\n" + "blt 75f\n" + "73:" // Height 5: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "blt 74f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "sadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "mov x9, #0x0\n" + "74:" // Height 5: Multiply loop: unique 5: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 73b\n" + "75:" // Height 5: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "76:" // 
Height 5: Multiply loop: Main loop skip + "cbz x27, 85f\n" + "tbz x27, #3, 80f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x27, #2, 78f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "tbz x27, #1, 77f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "b 84f\n" + "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "b 84f\n" + "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 79f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "b 84f\n" + "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "b 84f\n" + "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 82f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr 
s19, [x22], #0x4\n" + "tbz x27, #1, 81f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "b 84f\n" + "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "b 84f\n" + "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 83f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "b 84f\n" + "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "84:" // Height 5: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "85:" // Height 5: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 70b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "sadalp v20.4s, v21.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, 
v2.4s\n" + "str s20, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "86:" // Height 6 + "movi v1.8h, #0x0\n" + "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v18.8h, #0x0\n" + "movi v17.4s, #0x0\n" + "87:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 88f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "ldr x20, [x19, #0x28]\n" + "cbnz x28, 89f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 89f\n" + "88:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "add x20, x22, %x[input_offset]\n" + "89:" // Height 6: input setup done + "cmp x27, #0x10\n" + "blt 93f\n" + "cmp x27, #0x20\n" + "blt 92f\n" + "90:" // Height 6: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "blt 91f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, 
v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "sadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "sadalp v17.4s, v18.8h\n" + "movi v18.8h, #0x0\n" + "mov x9, #0x0\n" + "91:" // Height 6: Multiply loop: unique 6: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 90b\n" + "92:" // Height 6: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "93:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 102f\n" + "tbz x27, #3, 97f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz x27, #2, 95f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz x27, #1, 94f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b 
}[14], [x22]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 101f\n" + "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 101f\n" + "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 96f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 101f\n" + "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x20]\n" + "b 101f\n" + "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 99f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz x27, #1, 98f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 101f\n" + "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[4], 
[x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 101f\n" + "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 100f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h16, [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 101f\n" + "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "101:" // Height 6: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "102:" // Height 6: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x21\n" + "bne 87b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "sadalp v20.4s, v21.8h\n" + "sadalp v17.4s, v18.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "subs %x[M], %x[M], #0x6\n" + "addp v20.4s, v20.4s, v17.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str d20, [%x[out_ptr]], #0x8\n" + "beq 104f\n" + "tbz %x[flags], #3, 103f\n" + "add %x[input_offset], %x[input_offset], #0x6\n" + "b 1b\n" + "103:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n" + "b 1b\n" + "104:" // Exit + + : [M] "+r" (M), 
[input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp new file mode 100644 index 0000000000..f5709d92ac --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "quantized.hpp" +#include "utils.hpp" + +#include + +namespace arm_gemm { + +template<> +void row_sums_indirect( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, int32_t *out_ptr, const Requantize32 *qp +) +{ + struct KernelArgs { + unsigned int num_strings; + const unsigned int *string_lengths; + unsigned int input_initial_col; + } ka; + + unsigned long flags=0; + void *input_ptr; + size_t input_offset; + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + input_offset=A_arg.direct.stride; + } + + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + + __asm__ __volatile__( + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "neg v2.4s, v2.4s\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 86f\n" + "cmp %x[M], #0x4\n" + "bgt 69f\n" + "beq 52f\n" + "cmp %x[M], #0x2\n" + "bgt 35f\n" + "beq 18f\n" + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "movi v0.4s, #0x0\n" + "mov x9, #0x0\n" + "mov x28, #0x0\n" + "2:" // Height 1: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 3f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "cbnz x28, 4f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 4f\n" + "3:" // Height 1: setup direct input + 
"mov x26, %x[input_ptr]\n" + "4:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 8f\n" + "cmp x27, #0x20\n" + "blt 7f\n" + "5:" // Height 1: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "blt 6f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "mov x9, #0x0\n" + "6:" // Height 1: Multiply loop: unique 1: no collapse + "uadalp v1.8h, v31.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 5b\n" + "7:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "8:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 17f\n" + "tbz x27, #3, 12f\n" + "ldr d31, [x26], #0x8\n" + "tbz x27, #2, 10f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "tbz x27, #1, 9f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[14], [x26]\n" + "b 16f\n" + "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[12], [x26]\n" + "b 16f\n" + "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 11f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[10], [x26]\n" + "b 16f\n" + "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[8], [x26]\n" + "b 16f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 14f\n" + "ldr s31, [x26], #0x4\n" + "tbz x27, #1, 13f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[6], [x26]\n" + "b 16f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[4], [x26]\n" + "b 16f\n" + "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 15f\n" + "ldr h31, [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[2], [x26]\n" + "b 16f\n" + 
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 2b\n" + "uadalp v0.4s, v1.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str s0, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "18:" // Height 2 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "cbnz x28, 21f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "21:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 25f\n" + "cmp x27, #0x20\n" + "blt 24f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "blt 23f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "mov x9, #0x0\n" + "23:" // Height 2: Multiply loop: unique 2: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 22b\n" + "24:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "add x26, x26, #0x10\n" + 
"add x25, x25, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "25:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 34f\n" + "tbz x27, #3, 29f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "tbz x27, #2, 27f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "tbz x27, #1, 26f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "b 33f\n" + "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "b 33f\n" + "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 28f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "b 33f\n" + "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "b 33f\n" + "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 31f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "tbz x27, #1, 30f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "b 33f\n" + "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "b 33f\n" + "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 32f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "b 33f\n" + "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "33:" // Height 2: 
Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "34:" // Height 2: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 19b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "b 104f\n" + "35:" // Height 3 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "36:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "cbnz x28, 38f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "38:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 42f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 3: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "blt 40f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "mov x9, #0x0\n" + "40:" // Height 3: Multiply loop: unique 3: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "add x9, x9, #0x1\n" + "sub x27, 
x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "add x24, x24, #0x10\n" + "42:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 51f\n" + "tbz x27, #3, 46f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "tbz x27, #2, 44f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "tbz x27, #1, 43f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "b 50f\n" + "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "b 50f\n" + "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 45f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "b 50f\n" + "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "b 50f\n" + "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 48f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "tbz x27, #1, 47f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { 
v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "b 50f\n" + "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "b 50f\n" + "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 49f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "b 50f\n" + "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "50:" // Height 3: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "51:" // Height 3: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 36b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v26.4s, v26.4s, v2.4s\n" + "str s26, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "52:" // Height 4 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "53:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, 
[x19, #0x18]\n" + "cbnz x28, 55f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 55f\n" + "54:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "55:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 59f\n" + "cmp x27, #0x20\n" + "blt 58f\n" + "56:" // Height 4: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "blt 57f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "mov x9, #0x0\n" + "57:" // Height 4: Multiply loop: unique 4: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 56b\n" + "58:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x26, x26, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "59:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 68f\n" + "tbz x27, #3, 63f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x27, #2, 61f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], 
[x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz x27, #1, 60f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "b 67f\n" + "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "b 67f\n" + "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 62f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "b 67f\n" + "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "b 67f\n" + "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 65f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz x27, #1, 64f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "b 67f\n" + "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "b 67f\n" + "65:" // Height 4: 
Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 66f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "b 67f\n" + "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "67:" // Height 4: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "68:" // Height 4: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 53b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "addp v0.4s, v0.4s, v29.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "b 104f\n" + "69:" // Height 5 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "70:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 71f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "cbnz x28, 72f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, 
x19\n" + "add x22, x22, x19\n" + "b 72f\n" + "71:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "72:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 76f\n" + "cmp x27, #0x20\n" + "blt 75f\n" + "73:" // Height 5: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "blt 74f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "uadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "mov x9, #0x0\n" + "74:" // Height 5: Multiply loop: unique 5: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 73b\n" + "75:" // Height 5: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "76:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 85f\n" + "tbz x27, #3, 80f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], 
#0x8\n" + "tbz x27, #2, 78f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "tbz x27, #1, 77f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "b 84f\n" + "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "b 84f\n" + "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 79f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "b 84f\n" + "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "b 84f\n" + "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 82f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "tbz x27, #1, 81f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], 
[x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "b 84f\n" + "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "b 84f\n" + "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 83f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "b 84f\n" + "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "84:" // Height 5: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "85:" // Height 5: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 70b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "uadalp v20.4s, v21.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str s20, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "86:" // Height 6 + "movi v1.8h, #0x0\n" + "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, 
#0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v18.8h, #0x0\n" + "movi v17.4s, #0x0\n" + "87:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 88f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "ldr x20, [x19, #0x28]\n" + "cbnz x28, 89f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 89f\n" + "88:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "add x20, x22, %x[input_offset]\n" + "89:" // Height 6: input setup done + "cmp x27, #0x10\n" + "blt 93f\n" + "cmp x27, #0x20\n" + "blt 92f\n" + "90:" // Height 6: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "blt 91f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "uadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "uadalp v17.4s, v18.8h\n" + "movi v18.8h, #0x0\n" + "mov x9, #0x0\n" + "91:" 
// Height 6: Multiply loop: unique 6: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 90b\n" + "92:" // Height 6: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "93:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 102f\n" + "tbz x27, #3, 97f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz x27, #2, 95f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz x27, #1, 94f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 101f\n" + "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + 
"ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 101f\n" + "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 96f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 101f\n" + "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x20]\n" + "b 101f\n" + "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 99f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz x27, #1, 98f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 101f\n" + "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 101f\n" + "99:" // Height 6: Multiply loop: Ragged operand 
read: partial_2_0 + "tbz x27, #1, 100f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h16, [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 101f\n" + "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "101:" // Height 6: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "102:" // Height 6: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x21\n" + "bne 87b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "uadalp v20.4s, v21.8h\n" + "uadalp v17.4s, v18.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "subs %x[M], %x[M], #0x6\n" + "addp v20.4s, v20.4s, v17.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str d20, [%x[out_ptr]], #0x8\n" + "beq 104f\n" + "tbz %x[flags], #3, 103f\n" + "add %x[input_offset], %x[input_offset], #0x6\n" + "b 1b\n" + "103:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n" + "b 1b\n" + "104:" // Exit + + : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" 
(offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp index 1d3aee7911..4669be9993 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp @@ -23,8 +23,10 @@ */ #pragma once +#include "convolver.hpp" #include "mergeresults.hpp" #include "transform.hpp" +#include "interleave_indirect.hpp" namespace arm_gemm { @@ -39,14 +41,26 @@ namespace arm_gemm { * The optional 'block' parameter is for kernels using dot-product type * instructions like UDOT and SDOT. 
*/ -template +template class StdTransformsFixed { public: template void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0, - const int ymax, const int k0, const int kmax) const { - Transform(out, in, stride, y0, ymax, k0, kmax); + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const { + Interleave(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + IndirectInterleave(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver &conv, size_t rounded_stringlen, + const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + ConvolutionInterleave(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); } template diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp index 13c4c477c6..3256d919ea 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp @@ -23,6 +23,7 @@ */ #pragma once +#include "convolver.hpp" #include "mergeresults.hpp" #include "transform.hpp" @@ -38,20 +39,32 @@ namespace arm_gemm { * The optional 'block' parameter is for kernels using dot-product type * instructions like UDOT and SDOT. 
*/ -template +template class StdTransformsSVE { public: template void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0, - const int ymax, const int k0, const int kmax) { - Transform(out, in, stride, y0, ymax, k0, kmax); + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + Interleave(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + IndirectInterleave(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver &conv, size_t rounded_stringlen, + const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + ConvolutionInterleave(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); } template void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { - Transform(out, in, stride, x0, xmax, k0, kmax); + Transform(out, in, stride, x0, xmax, k0, kmax); } template diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp index c6ea079882..5efeee5d35 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.hpp +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -38,13 +38,13 @@ namespace arm_gemm { * Need to cope with the work requested in either dimension not actually * being a multiple of the block sizes. 
*/ -template +template struct TransformImpl { template static void Transform(TOut* out, const TIn* const in, const int stride, const int y0, const int ymax, const int x0, const int xmax) { // For SVE cases we multiply the interleave factor by the vector length. - const unsigned int IntBy = tIntBy * (sve ? get_vector_length() / BlockBy : 1); + const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length() / BlockBy : 1); const int n_whole_y_blocks = (ymax - y0) / IntBy; const int y_remainders = (ymax - y0) % IntBy; @@ -105,13 +105,13 @@ struct TransformImpl { }; /*****************************************************************************/ -template +template void Transform( TOut* out, const TIn* const in, const int stride, const int k0, const int kmax, const int x0, const int xmax ) { // Redirect to a specialised implementation predicated on argument size. - TransformImpl::Transform( + TransformImpl::Transform( out, in, stride, k0, kmax, x0, xmax ); } diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp deleted file mode 100644 index 2df5d1bd28..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - bool first = true; - - uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - case 4: - inptr1 = zerobuff; - // fall through - case 3: - inptr2 = zerobuff; - // fall through - case 2: - inptr3 = zerobuff; - // fall through - case 1: - inptr4 = zerobuff; - // fall through - case 0: - inptr5 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 - "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 - "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 - "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 - "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 - "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 - "VLD1.32 {d16-d19}, [%[inptr4]]!\n" - "VLD1.32 {d20-d23}, [%[inptr5]]!\n" - "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 - ASM_PREFETCH("[%[inptr0], #128]") - "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 - - // Store first elements - "VST1.32 {d0-d1}, [%[outptr]]!\n" - "VST1.32 {d16}, [%[outptr]]!\n" - - "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 - - // Store second elements - "VST1.32 {d4-d5}, [%[outptr]]!\n" - "VZIP.32 q1, q5\n" - ASM_PREFETCH("[%[inptr1], #128]") - "VST1.32 {d17}, [%[outptr]]!\n" - "VZIP.32 q3, q7\n" - - // Store third elements - "VZIP.32 q9, q11\n" - "VST1.32 {d8-d9}, [%[outptr]]!\n" - "VZIP.32 q1, q3\n" - ASM_PREFETCH("[%[inptr2], #128]") - "VST1.32 {d20}, [%[outptr]]!\n" - - // Store fourth elements - "VZIP.32 q5, q7\n" - "VST1.32 {d12-d13}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr3], #128]") - "VST1.32 {d21}, [%[outptr]]!\n" - - // Fifth - "VST1.32 {d2-d3}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr4], #128]") - "VST1.32 {d18}, [%[outptr]]!\n" - - // Sixth - "VST1.32 {d6-d7}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr5], #128]") - "VST1.32 {d19}, [%[outptr]]!\n" - - // Seventh - "VST1.32 {d10-d11}, [%[outptr]]!\n" - "VST1.32 {d22}, [%[outptr]]!\n" - - // Eighth - "VST1.32 {d14-d15}, [%[outptr]]!\n" - "VST1.32 {d23}, [%[outptr]]!\n" - - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = 
*inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp index 8f0b8ae63f..3ce1d328a7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp @@ -30,22 +30,22 @@ // Generic unblocked transposed 8x32-bit sized specialisation template <> template -inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 16x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2, false>::Transform( + TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax ); } -// Generic 12x16-bit sized specialisation +// Generic 16x16-bit sized specialisation template <> template -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp deleted file mode 100644 
index 9b6f4de543..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -#include - -#include "../asmlib.hpp" -#include "../utils.hpp" - -template<> -template -void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = (uint8_t *)out; - const uint8_t *inptr = (uint8_t *)in; - - uint8_t zerobuff[16] = { 0 }; - - for (int y=y0; y(y) * ldin + k0; - const uint8_t *inptr1 = inptr0 + ldin; - const uint8_t *inptr2 = inptr1 + ldin; - const uint8_t *inptr3 = inptr2 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - - int x=(kmax-k0); - for (;x>15;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - inptr1 = zerobuff; - // fall through - case 1: - inptr2 = zerobuff; - // fall through - case 0: - inptr3 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - __asm __volatile ( - "LDR q0, [%[inptr0]], #16\n" - ASM_PREFETCH("[%[inptr0], #176]") - "LDR q1, [%[inptr1]], #16\n" - ASM_PREFETCH("[%[inptr1], #176]") - "STP q0, q1, [%[outptr]], #32\n" - "LDR q0, [%[inptr2]], #16\n" - ASM_PREFETCH("[%[inptr2], #176]") - "LDR q1, [%[inptr3]], #16\n" - ASM_PREFETCH("[%[inptr3], #176]") - "STP q0, q1, [%[outptr]], #32\n" - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [outptr] "+r" (outptr) - : - : "v0", "v1" - ); - } - - if (x>0) { - /* Need to duplicate this here, in case we didn't run the main loop. 
*/ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - inptr1 = zerobuff; - // fall through - case 1: - inptr2 = zerobuff; - // fall through - case 0: - inptr3 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */ - auto f = [&outptr, x](const uint8_t *&p) { - for (int i=0; i<16; i++) { - if (i < x) { - *outptr++ = *p++; - } else { - *outptr++ = 0; - } - } - }; - - f(inptr0); - f(inptr1); - f(inptr2); - f(inptr3); - } - } -} - -#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp deleted file mode 100644 index 3d912c4675..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include - -#include "../asmlib.hpp" - -template<> -template -void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = (uint16_t *)out; - const uint16_t *inptr = (const uint16_t *)in; - bool first=true; - - uint16_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x <= 7) { - break; - } - - first = false; - } - - int skippf = (x & 31); - __asm __volatile ( - // Load up 8 elements (1 vector) from each of 8 sources. - "CBNZ %w[skippf], 1f\n" - ASM_PREFETCH("[%[inptr0], #128]") - ASM_PREFETCH("[%[inptr1], #128]") - ASM_PREFETCH("[%[inptr2], #128]") - ASM_PREFETCH("[%[inptr3], #128]") - "1:\n" - - "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 - "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 - "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... 
- "LDR q6, [%[inptr6]], #16\n" - "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 - "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 - "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 - "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 - "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 - "LDR q5, [%[inptr5]], #16\n" - "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... - "LDR q7, [%[inptr7]], #16\n" - "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 - "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 - "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 - "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 - - "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 - "ZIP2 v20.8h, v8.8h, v9.8h\n" - "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 - "ZIP2 v21.8h, v10.8h, v11.8h\n" - - "CBNZ %w[skippf], 2f\n" - ASM_PREFETCH("[%[inptr4], #112]") - ASM_PREFETCH("[%[inptr5], #112]") - ASM_PREFETCH("[%[inptr6], #112]") - ASM_PREFETCH("[%[inptr7], #112]") - "2:\n" - - "ZIP1 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v30.8h, v16.8h, v17.8h\n" - "ZIP1 v23.8h, v18.8h, v19.8h\n" - "ZIP2 v31.8h, v18.8h, v19.8h\n" - - "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 - "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 - "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements - - "ZIP1 v0.8h, v20.8h, v21.8h\n" - "ZIP2 v1.8h, v20.8h, v21.8h\n" - "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v2.8h, v22.8h, v23.8h\n" - "ZIP2 v3.8h, v22.8h, v23.8h\n" - "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v4.8h, v30.8h, v31.8h\n" - "ZIP2 v5.8h, v30.8h, v31.8h\n" - "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : [skippf] "r" (skippf) - : "v0", "v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp deleted file mode 100644 index 701d688af2..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = (uint32_t *)out; - const uint32_t *inptr = (uint32_t *)in; - bool first = true; - - uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], 
#32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && !__ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp deleted file mode 100644 index 2546cc571a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - bool first = true; - - /* Helper functions to copy blocks about used for odd case. */ - class t { - public: - static inline void copy_4_inc(uint8_t *&out, const uint8_t *&in) { - uint32_t *out_word = reinterpret_cast(out); - const uint32_t *in_word = reinterpret_cast(in); - - *out_word++ = *in_word++; - - out = reinterpret_cast(out_word); - in = reinterpret_cast(in_word); - } - - static inline void copy_pad(uint8_t *&out, const uint8_t *&in, size_t count) { - for (unsigned int i=0; i<4; i++) { - if (i < count) { - *out++ = *in++; - } else { - *out++ = 0; - } - } - } - }; - - uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop - - for (int y=y0; y31) || first;x-=32) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=32. 
*/ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=31) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, 
[%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - // Copy any leftover blocks of 4 a complete block at a time. - for (;x>4;x-=4) { - t::copy_4_inc(outptr, inptr0); - t::copy_4_inc(outptr, inptr1); - t::copy_4_inc(outptr, inptr2); - t::copy_4_inc(outptr, inptr3); - t::copy_4_inc(outptr, inptr4); - t::copy_4_inc(outptr, inptr5); - t::copy_4_inc(outptr, inptr6); - t::copy_4_inc(outptr, inptr7); - } - - // Final block with padding, if any. 
- if (x > 0) { - t::copy_pad(outptr, inptr0, x); - t::copy_pad(outptr, inptr1, x); - t::copy_pad(outptr, inptr2, x); - t::copy_pad(outptr, inptr3, x); - t::copy_pad(outptr, inptr4, x); - t::copy_pad(outptr, inptr5, x); - t::copy_pad(outptr, inptr6, x); - t::copy_pad(outptr, inptr7, x); - } - } -} - -#endif // __aarch64__ && !__ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp deleted file mode 100644 index a342d6c3d1..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) { - float *outptr = out; - const __fp16 *inptr = in; - bool first = true; - - __fp16 zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "FCVTL2 v3.4s, v2.8h\n" - "FCVTL v2.4s, v2.4h\n" - "FCVTL2 v5.4s, v4.8h\n" - "FCVTL v4.4s, v4.4h\n" - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "FCVTL2 v9.4s, v8.8h\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "FCVTL2 v11.4s, v10.8h\n" - "FCVTL v10.4s, v10.4h\n" - "FCVTL2 v13.4s, v12.8h\n" - "FCVTL v12.4s, v12.4h\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "LDR q14, [%[inptr7]], #16\n" - "FCVTL2 v15.4s, v14.8h\n" - "FCVTL v14.4s, v14.4h\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - ASM_PREFETCH("[%[inptr7], #128]") - - 
"ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp deleted file mode 100644 index 37344a82a9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) { - int16_t *outptr = out; - const int8_t *inptr = in; - bool first = true; - - int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop - - for (int y=y0; y15) || first;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. 
*/ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=15) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources. - "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "SSHLL2 v1.8h, v0.16b, #0\n" - "SSHLL v0.8h, v0.8b, #0\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "SSHLL2 v3.8h, v2.16b, #0\n" - "SSHLL v2.8h, v2.8b, #0\n" - "SSHLL2 v5.8h, v4.16b, #0\n" - "SSHLL v4.8h, v4.8b, #0\n" - "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "SSHLL2 v7.8h, v6.16b, #0\n" - "SSHLL v6.8h, v6.8b, #0\n" - "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "SSHLL2 v9.8h, v8.16b, #0\n" - "SSHLL v8.8h, v8.8b, #0\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "SSHLL2 v11.8h, v10.16b, #0\n" - "SSHLL v10.8h, v10.8b, #0\n" - "SSHLL2 v13.8h, v12.16b, #0\n" - "SSHLL v12.8h, v12.8b, #0\n" - "ZIP1 v18.8h, v8.8h, v12.8h\n" - "LDR q14, [%[inptr7]], #16\n" - "SSHLL2 v15.8h, v14.16b, #0\n" - "SSHLL v14.8h, v14.8b, #0\n" - "ZIP1 v19.8h, v10.8h, v14.8h\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1 - "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1 - "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3 - "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3 - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.8h, v0.8h, v4.8h\n" - "ZIP2 v17.8h, v2.8h, 
v6.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v8.8h, v12.8h\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.8h, v10.8h, v14.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v16.8h, v1.8h, v5.8h\n" - "ZIP1 v17.8h, v3.8h, v7.8h\n" - ASM_PREFETCH("[%[inptr6], #128]") - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP1 v18.8h, v9.8h, v13.8h\n" - "ZIP1 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Third element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fourth element - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - - "ZIP2 v16.8h, v1.8h, v5.8h\n" - "ZIP2 v17.8h, v3.8h, v7.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v9.8h, v13.8h\n" - "ZIP2 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fifth element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Sixth element - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Seventh element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - "STP q24, q25, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" 
(inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp deleted file mode 100644 index a3a269c9cd..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = out; - const uint8_t *inptr = in; - bool first = true; - - uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop - - for (int y=y0; y15) || first;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=15) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "USHLL2 v1.8h, v0.16b, #0\n" - "USHLL v0.8h, v0.8b, #0\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "USHLL2 v3.8h, v2.16b, #0\n" - "USHLL v2.8h, v2.8b, #0\n" - "USHLL2 v5.8h, v4.16b, #0\n" - "USHLL v4.8h, v4.8b, #0\n" - "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "USHLL2 v7.8h, v6.16b, #0\n" - "USHLL v6.8h, v6.8b, #0\n" - "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "USHLL2 v9.8h, v8.16b, #0\n" - "USHLL v8.8h, v8.8b, #0\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "USHLL2 v11.8h, v10.16b, #0\n" - "USHLL v10.8h, v10.8b, #0\n" - "USHLL2 v13.8h, v12.16b, #0\n" - "USHLL v12.8h, v12.8b, #0\n" - "ZIP1 v18.8h, v8.8h, v12.8h\n" - "LDR q14, [%[inptr7]], #16\n" - "USHLL2 v15.8h, v14.16b, #0\n" - "USHLL v14.8h, v14.8b, #0\n" - "ZIP1 v19.8h, v10.8h, v14.8h\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1 - "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1 - "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3 - "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3 - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.8h, v0.8h, v4.8h\n" - "ZIP2 v17.8h, v2.8h, v6.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v8.8h, v12.8h\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.8h, v10.8h, v14.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v16.8h, v1.8h, v5.8h\n" - 
"ZIP1 v17.8h, v3.8h, v7.8h\n" - ASM_PREFETCH("[%[inptr6], #128]") - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP1 v18.8h, v9.8h, v13.8h\n" - "ZIP1 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Third element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fourth element - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - - "ZIP2 v16.8h, v1.8h, v5.8h\n" - "ZIP2 v17.8h, v3.8h, v7.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v9.8h, v13.8h\n" - "ZIP2 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fifth element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Sixth element - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Seventh element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - "STP q24, q25, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ 
&& __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp index 5ab5774751..f6233ef503 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 6x32-bit sized specialisation template <> template -inline void TransformImpl<6, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2, false>::Transform( + TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<6, 1, true, 4, 4, false>::Transform( // Generic 12x16-bit sized specialisation template <> template -inline void TransformImpl<12, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -135,7 +135,7 @@ inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<12, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp index d7de9ff934..c0f3e17d31 
100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp @@ -110,7 +110,7 @@ inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __ template <> template <> -inline void TransformImpl<12, 1, true, 4, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform( float* out, const __fp16* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp index a137f9360a..bcbe2b84d8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 12x32-bit sized specialisation template <> template -inline void TransformImpl<12, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 24 x uint16_t specialisation - TransformImpl<24, 1, true, 2, 2, false>::Transform( + TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<12, 1, true, 4, 4, false>::Transform( // Generic 24x16-bit sized specialisation template <> template -inline void TransformImpl<24, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -120,7 +120,7 @@ inline void 
TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<24, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp index 974be481e7..df68740bb4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 8x32-bit sized specialisation template <> template -inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 16 x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2, false>::Transform( + TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( // Generic 16x16-bit sized specialisation template <> template -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -137,7 +137,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const 
uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index b825e1c358..e092c729ba 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -21,22 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "a32_interleave_6way_32bit.hpp" #include "a32_transpose_interleave_8way_32bit.hpp" -#include "a64_block16_interleave4_8bit.hpp" -#include "a64_interleave_8way_16bit.hpp" -#include "a64_interleave_8way_32bit.hpp" -#include "a64_interleave_8way_block4_8bit.hpp" -#include "a64_interleave_8way_half_to_float.hpp" -#include "a64_interleave_8way_s8_to_s16.hpp" -#include "a64_interleave_8way_u8_to_u16.hpp" #include "a64_transpose_interleave_12way_16bit.hpp" #include "a64_transpose_interleave_12way_half_to_float.hpp" #include "a64_transpose_interleave_24way_16bit.hpp" #include "a64_transpose_interleave_8way_32bit.hpp" -#include "sve_interleave_8way_32bit.hpp" -#include "sve_interleave_8way_block2_16bit.hpp" -#include "sve_interleave_8way_block2_32bit.hpp" -#include "sve_interleave_8way_block4_16bit.hpp" -#include "sve_interleave_8way_block4_8bit.hpp" -#include "sve_interleave_8way_block8_8bit.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp deleted file mode 100644 index 348d78e3f5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -template<> -template -inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint32_t *master_outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint16_t *master_outptr = reinterpret_cast(out); - const uint16_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint32_t *master_outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint16_t *master_outptr = reinterpret_cast(out); - const uint16_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint8_t *master_outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint8_t *master_outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - - for (int y=y0; y struct TransposeInterleaveCommon { // Override the moveblock_1xY methods to improve performance diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index 6e47a97c78..6d483a3b9d 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -24,6 +24,8 @@ #pragma once 
+#include "arm_gemm.hpp" + #include // Macro for unreachable code (e.g. impossible default cases on switch) @@ -32,6 +34,8 @@ // Paranoid option for the above with assert // #define UNREACHABLE(why) assert(0 && why) +namespace arm_gemm { + template inline T iceildiv(const T a, const T b) { return (a + b - 1) / b; @@ -48,7 +52,94 @@ inline T roundup(const T a, const T b) { } } -namespace arm_gemm { +enum class VLType { + None, + SVE, +}; + +template +struct IndirectOutputArg { + struct { + T *base; + size_t stride; + } direct = {}; + struct { + T * const *ptr; + size_t offset; + } indirect = {}; + bool is_indirect; + + // Direct + IndirectOutputArg(T *base, size_t stride) : is_indirect(false) { + direct.base = base; + direct.stride = stride; + } + + // Indirect + IndirectOutputArg(T * const * ptr, size_t offset) : is_indirect(true) { + indirect.ptr = ptr; + indirect.offset = offset; + } + + IndirectOutputArg() : is_indirect(false) { + direct.base = nullptr; + direct.stride = 0; + } +}; + +// Check that the provided Requantize32 doesn't have a left shift. +inline bool quant_no_left_shift(const Requantize32 &qp) { + if (qp.per_channel_requant) { + return (qp.per_channel_left_shifts == nullptr); + } else { + return (qp.per_layer_left_shift == 0); + } +} + +// Check that the provided Requantize32 is compatible with the "symmetric" hybrid kernels. These don't include row +// sums, so the 'b_offset' has to be zero. +inline bool quant_hybrid_symmetric(const Requantize32 &qp) { + return quant_no_left_shift(qp) && qp.b_offset == 0; +} + +// Check that the provided Requantize32 is compatible with the "asymmetric" hybrid kernels. These don't support per +// channel quantization. Technically b_offset==0 cases would work, but it is a waste to sum and then multiply by 0... 
+inline bool quant_hybrid_asymmetric(const Requantize32 &qp) { + return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant==false; +} + +template +struct IndirectInputArg { + struct { + const T *base; + size_t stride; + } direct = {}; + struct { + const T * const * const * ptr; + unsigned int start_row; + unsigned int start_col; + } indirect = {}; + bool is_indirect; + + // Direct + IndirectInputArg(const T *base, size_t stride) : is_indirect(false) { + direct.base = base; + direct.stride = stride; + } + + // Indirect + IndirectInputArg(const T * const * const *ptr, unsigned int start_row, unsigned int start_col) : is_indirect(true) { + indirect.ptr = ptr; + indirect.start_row = start_row; + indirect.start_col = start_col; + } + + IndirectInputArg() : is_indirect(false) { + direct.base = nullptr; + direct.stride = 0; + } +}; + namespace utils { namespace { diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp deleted file mode 100644 index 760274dba1..0000000000 --- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/WindowIterator.h" - -using namespace arm_compute; - -INEGEMMWrapperKernel::INEGEMMWrapperKernel() - : _a(nullptr), _b(nullptr), _c(nullptr), _params(), _gemm_info(), _window3d(), _window_shape() -{ -} - -INEGEMMWrapperKernel::Params INEGEMMWrapperKernel::extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info) -{ - Params p; - - ARM_COMPUTE_ERROR_ON_NULLPTR(a); - ARM_COMPUTE_ERROR_ON_NULLPTR(b); - ARM_COMPUTE_ERROR_ON_NULLPTR(c); - - // Initalize params - p.M = c->info()->tensor_shape().y(); - p.N = c->info()->tensor_shape().x(); - p.K = a->info()->tensor_shape().x(); - p.multis = b->info()->tensor_shape().z(); - p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs - - // Update M in case of GEMM3D for output - if(gemm_info.depth_output_gemm3d() != 0) - { - p.M = c->info()->tensor_shape().y() * c->info()->tensor_shape().z(); - p.batches = c->info()->tensor_shape().total_size_upper(3) / p.multis; - } - - return p; -} - -void INEGEMMWrapperKernel::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info) -{ - _gemm_info = gemm_info; - _params = 
extract_parameters(a, b, c, gemm_info); - _a = a; - _b = b; - _c = c; - - _window3d = configure_internal(alpha, beta); - _window_shape = _window3d.shape(); - - // Convert the 3D window into a 1D window in order to allow the scheduler to arbitrary split it. - Window collapsed; - collapsed.set(0, Window::Dimension(0, _window3d.num_iterations_total())); - - INEKernel::configure(collapsed); -} - -void INEGEMMWrapperKernel::run(const Window &window, const ThreadInfo &info) -{ - const Coordinates start_offset = index2coords(_window_shape, window.x().start()); - const Coordinates end_offset = index2coords(_window_shape, window.x().end() - 1); - - run_internal(_window3d, start_offset, end_offset, info); -} diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h deleted file mode 100644 index 92c013260b..0000000000 --- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_INEGEMMWRAPPERKERNEL_H -#define SRC_INEGEMMWRAPPERKERNEL_H - -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Common interface for all the arm_gemm Gemms - */ -class INEGEMMWrapperKernel : public INEKernel -{ -public: - /** Parameters defining the dimensions of the matrices being multiplied */ - struct Params - { - unsigned int M{ 0 }; /**< Rows in output matrix C (and input matrix A). */ - unsigned int N{ 0 }; /**< Columns in output matrix C (and input matrix B). */ - unsigned int K{ 0 }; /**< Columns of input matrix A (= rows of input matrix B). */ - unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */ - unsigned int multis{ 0 }; /**< Number of "multi" GEMMs (unique A, B and C). */ - }; - - static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info); - - /** Constructor */ - INEGEMMWrapperKernel(); - /** Prevent instances of this class from being copied */ - INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete; - /** Prevent instances of this class from being copied */ - INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete; - /** Allow instances of this class to be moved */ - INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default; - /** Allow instances of this class to be moved */ - INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default; - /** Initialise the kernel's input and output. 
- * - * @note The input and output tensor must have the same dimensions - * - * @param[in] a Input tensor (Matrix A) - * @param[in] b Input tensor (Matrix B) - * @param[out] c Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. - * @param[in] alpha Scalar multiplier to apply to AB matrix product. - * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. - * @param[in] gemm_info GEMM meta-data - */ - void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - /** Called as part of configure() after _a, _b, _c and _params have been set. - * - * @param[in] alpha Scalar multiplier to apply to AB matrix product. - * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. - * - * @return A 3D execution window. - */ - virtual Window configure_internal(float alpha, float beta) = 0; - - /** Run the kernel from the start to the end offset in window. - * - * @param[in] window Window to use for the iteration - * @param[in] start_offset Where to start iterating from (In Window coordinates) - * @param[in] end_offset Where to stop iterating (In Window coordinates). - * @param[in] info Info about executing thread and CPU. 
- */ - virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0; - - const ITensor *_a; - const ITensor *_b; - ITensor *_c; - Params _params; - GEMMInfo _gemm_info; - -private: - Window _window3d; - TensorShape _window_shape; -}; - -} // namespace arm_compute - -#endif /* SRC_INEGEMMRAPPERKERNEL_H */ diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp index f6421c12ab..3088b080d6 100644 --- a/src/core/NEON/kernels/assembly/arm_gemm.hpp +++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp @@ -43,7 +43,9 @@ enum class GemmMethod GEMM_INTERLEAVED_2D, QUANTIZE_WRAPPER, QUANTIZE_WRAPPER_2D, - GEMM_HYBRID_QUANTIZED + GEMM_HYBRID_QUANTIZED, + INDIRECT_GEMM, + CONVOLUTION_GEMM }; struct KernelDescription @@ -104,17 +106,19 @@ public: unsigned int _Msize; unsigned int _Nsize; unsigned int _Ksize; + unsigned int _Ksections; unsigned int _nbatches; unsigned int _nmulti; + bool _indirect_input; Activation _act; int _maxthreads; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N, - const unsigned int K, const unsigned int nbatches, - const unsigned int nmulti, Activation act, const int maxthreads, + GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, + unsigned int K, unsigned int Ksections, unsigned int nbatches, + unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _act(act), _maxthreads(maxthreads), _cfg(cfg) + : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg) { } }; @@ -143,8 +147,8 @@ public: Requantize32(const int32_t *bias, size_t bias_multi_stride, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t 
requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, int32_t(0))), - per_layer_right_shift(std::min(requant_shift, int32_t(0))), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), + per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) { } diff --git a/src/core/NEON/kernels/assembly/convolution_parameters.hpp b/src/core/NEON/kernels/assembly/convolution_parameters.hpp new file mode 100644 index 0000000000..d0ef5b539f --- /dev/null +++ b/src/core/NEON/kernels/assembly/convolution_parameters.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +namespace arm_gemm +{ +/* + * Parameter set for "convolution" type GEMM. + * + * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if + * an im2row had been performed on the input tensor to generate the operand + * matrix, but instead this structure describes the convolution parameters + * such that this can be done on the fly. + * + * The parameters describe the convolution details - the notional shape of + * the input and output tensors, whether padding is to be applied, the size + * of the kernel and a constant value to be used for padding (needed for + * quantized tensors). + * + * The second part describes the layout of the input tensor in memory, which + * is assumed to be in NHWC format. This consists of a base pointer and + * strides for columns, rows and batches. 'multis' are not supported for + * convolution type GEMMs. + */ +struct ConvolutionParameters +{ + int64_t input_width; + int64_t input_height; + int64_t input_channels; + int64_t kernel_width; + int64_t kernel_height; + int64_t output_width; + int64_t output_height; + int64_t output_stride_w; + int64_t output_stride_h; + // output_channels not included as they do not affect the input. 
+ int64_t padding_top; + int64_t padding_left; + float padding_value; +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp index e9e56842c7..e1fb7a45a8 100644 --- a/src/core/NEON/kernels/assembly/gemm_common.hpp +++ b/src/core/NEON/kernels/assembly/gemm_common.hpp @@ -23,6 +23,7 @@ */ #pragma once +#include "convolution_parameters.hpp" #include "ndrange.hpp" #include @@ -77,7 +78,7 @@ public: return false; } - /** Main execute member function + /** Main execute member function * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() * @param [in] thread_locator where are we inside of the thread space * @naram [in] threadid a unique threadid @@ -123,6 +124,19 @@ public: { } + /*** Indirect interface (optional) ***/ + /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed array of pointers, + * multis * batches * kernel_points */ + virtual void set_indirect_parameters_generic(size_t, const void *const *const *) + { + } + + /*** Convolution interface (optional) ***/ + /* Set the convolution parameters.
*/ + virtual void set_convolution_parameters(ConvolutionParameters) + { + } + // Destructor virtual ~IGemmCommon() { @@ -200,6 +214,16 @@ public: { pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); } + + /*** Indirect interface ***/ + virtual void set_indirect_parameters(size_t, const To *const *const *) + { + } + + void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override + { + set_indirect_parameters(sz, reinterpret_cast(ptr)); + } }; } // namespace arm_gemm diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 901b1e880e..cc5f160787 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -27,27 +27,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NECol2ImKernel.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NECopyKernel.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" -#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" -#include "src/core/NEON/kernels/NEFFTScaleKernel.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" 
-#include "src/core/NEON/kernels/NEIm2ColKernel.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" + #include "support/MemorySupport.h" #include @@ -71,6 +56,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: @@ -87,6 +73,13 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const _function = std::move(f); break; } + case ConvolutionMethod::GEMM_CONV2D: + { + auto f = arm_compute::support::cpp14::make_unique(_memory_manager); + f->configure(input, weights, biases, output, info); + _function = std::move(f); + break; + } case ConvolutionMethod::DIRECT: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); @@ -112,22 +105,22 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON"); + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); 
switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - //Validate Winograd ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM: - //Validate Gemm-based Convolution ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info)); break; + case ConvolutionMethod::GEMM_CONV2D: + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info)); + break; case ConvolutionMethod::DIRECT: - //Validate Direct Convolution ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; case ConvolutionMethod::FFT: - // Validate FFT-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)); break; default: @@ -149,6 +142,8 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo * const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1); + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ using ConvolutionConfiguration = std::tuple; using ConfigurationMethod = std::pair; @@ -235,7 +230,21 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo * } } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? 
ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + // For 1x1 convolutions run the default GEMM + if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + { + return ConvolutionMethod::GEMM; + } + + if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + { + return ConvolutionMethod::WINOGRAD; + } + if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info))) + { + return ConvolutionMethod::GEMM_CONV2D; + } + return ConvolutionMethod::GEMM; } } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 0215098792..9f52e458d2 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -47,7 +47,19 @@ using namespace arm_compute::misc::shape_calculator; namespace arm_compute { -NEGEMM::~NEGEMM() = default; +namespace +{ +AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + AsmGemmInfo asm_info; + asm_info.method = AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + + return asm_info; +} +} // namespace NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(), @@ -56,12 +68,15 @@ NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager * { } +NEGEMM::~NEGEMM() = default; + void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? 
c->info() : nullptr, d->info(), alpha, beta, gemm_info)); - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); - bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info)); + const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); + bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info)); // Check if we need to reshape the matrix B only on the first run _is_prepared = false; @@ -76,7 +91,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe if(run_optimised) { const ITensor *c_to_use = is_c_bias ? c : nullptr; - _asm_glue.configure(a, b, c_to_use, d, gemm_info); + _asm_glue.configure(a, b, c_to_use, d, asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured()); // Scale product by alpha @@ -221,7 +236,8 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } // Check if we need to run the optimized assembly kernel - const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info)); + AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? 
c : nullptr, output, asm_info)); if(!run_optimised) { diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index 5b0848398d..400fa64438 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -25,18 +25,70 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" #include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h" #include "src/core/NEON/kernels/assembly/arm_gemm.hpp" #include "support/MemorySupport.h" #include +#include namespace arm_compute { namespace { +struct free_delete +{ + void operator()(void *x) + { + free(x); + } +}; + +struct Params +{ + unsigned int M; + unsigned int N; + unsigned int K; + unsigned int batches; + unsigned int multis; + unsigned int sections; + bool indirect; +}; + +Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + + Params p; + p.K = a->info()->tensor_shape().x(); + p.N = d->info()->tensor_shape().x(); + p.multis = 1; + p.indirect = false; + p.sections = 1; + + if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + { + p.indirect = true; + p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3]; + } + else + { + p.M = d->info()->tensor_shape().y(); + p.multis = b->info()->tensor_shape().z(); + p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs + } + + // Update M in case of GEMM3D for output + if(info.depth_output_gemm3d != 0) + { + p.M = d->info()->tensor_shape().y() * d->info()->tensor_shape().z(); + p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis; + } + + return p; +} + arm_gemm::Activation map_to_arm_gemm_activation(const 
ActivationLayerInfo &act) { arm_gemm::Activation gemm_act; @@ -69,6 +121,29 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) return gemm_act; } +IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) +{ + // Schedule assembly kernel + const int granule_threshold = 200; + IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); + if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + { + scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + } + else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + { + //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + { + //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + + return scheduling_hint; +} + template class FallbackTransform : public ITransformWeights { @@ -165,7 +240,7 @@ public: * @param[in] os Output stage meta-data. 
*/ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, + arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -198,6 +273,16 @@ private: * @param[in] alignment Workspace memory alignment. */ void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); + /** Configure the indirect buffer + * + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[out] d Output tensor to store the result of matrix multiplication. + * @param[in] info GEMM meta-data + */ + void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); + /** Prepare the indirect buffer */ + void prepare_indirect_buffer(); /** Assembly Gemm kernel */ std::shared_ptr> _gemm_kernel_asm{ nullptr }; @@ -226,7 +311,7 @@ private: /** Prepared flag */ bool _is_prepared{ false }; /** GEMM meta-data */ - GEMMInfo _gemm_info{}; + AsmGemmInfo _gemm_info{}; /** Weights manager */ IWeightsManager *_weights_manager{ nullptr }; /** Weights transform object */ @@ -239,11 +324,16 @@ private: std::vector left_shifts{}; /** Per channel quantization multipliers */ std::vector _multipliers{}; + /** Indirect buffer */ + std::unique_ptr _indirect_arg{}; + std::unique_ptr _indirect_buf{}; + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; }; template -std::tuple Fallback::set_requantize_data(const std::vector &shifts, - const std::vector &multipliers) +std::tuple +Fallback::set_requantize_data(const std::vector &shifts, const std::vector &multipliers) { _multipliers = multipliers; _shifts = shifts; @@ -260,9 +350,123 @@ std::tuple Fallback +void Fallback::prepare_indirect_buffer() +{ + const TypeInput *A_ptr = reinterpret_cast(_a->buffer()); + 
const int multis = 1; + const int batches = _a->info()->tensor_shape().total_size_upper(3); + const size_t stride_A = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput); + const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput); + + const size_t output_hw = _cp.output_height * _cp.output_width; + const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); + const size_t batch_stride = batch_size / sizeof(TypeInput); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInput); + + for(int64_t m = 0; m < multis; m++) + { + for(int64_t b = 0; b < batches; b++) + { + for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + { + for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + { + int64_t output_xy = (output_y * _cp.output_width) + output_x; + + for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + { + for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + { + int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; + int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; + int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; + int64_t input_xy = (input_y * _cp.input_width) + input_x; + + if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + { + _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + } + else + { + _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); + } + } + } + } + } + } + } +} + +template +void Fallback::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +{ + 
ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); + + float zeropad = 0.f; + if(is_data_type_quantized(a->data_type())) + { + zeropad = a->quantization_info().uniform().offset; + } + + const int64_t input_width = static_cast(a->tensor_shape()[1]); + const int64_t input_height = static_cast(a->tensor_shape()[2]); + const int64_t input_channels = static_cast(a->tensor_shape()[0]); + const int64_t kernel_width = static_cast(b->tensor_shape()[2]); + const int64_t kernel_height = static_cast(b->tensor_shape()[3]); + const int64_t output_width = static_cast(d->tensor_shape()[1]); + const int64_t output_height = static_cast(d->tensor_shape()[2]); + + _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, + info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad + }; + + if(info.method == AsmConvMethod::Conv) + { + _gemm_kernel_asm->set_convolution_parameters(_cp); + } + + if(info.method == AsmConvMethod::Indirect) + { + const unsigned int multis = 1; + const unsigned int batches = a->tensor_shape().total_size_upper(3); + const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height; + const unsigned int output_hw = _cp.output_width * _cp.output_height; + + using TypeInputPtr = TypeInput *; + const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr); + const size_t batch_stride = batch_size / sizeof(TypeInputPtr); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInputPtr); + + _indirect_buf = std::unique_ptr(reinterpret_cast(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr(reinterpret_cast(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_pad = std::vector(_cp.input_channels, zeropad); + + // Set indirect argument + int64_t pos = 0; + for(int64_t m = 0; m < multis; m++) + { + for(int64_t b = 0; b < batches; b++) + 
{ + for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) + { + (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + } + } + } + + _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get()); + } +} + template void Fallback::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, + arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) { arm_gemm::GemmConfig gemm_cfg; @@ -325,6 +529,12 @@ void Fallback::configure(const ITensor *a, c static_cast(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment); } } + + // Handle indirect GEMM convolution + if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + { + configure_indirect(a->info(), b->info(), d->info(), gemm_info); + } } template @@ -365,6 +575,11 @@ void Fallback::prepare() } } + if(_gemm_info.method == AsmConvMethod::Indirect) + { + prepare_indirect_buffer(); + } + _is_prepared = true; } } @@ -387,23 +602,23 @@ bool Fallback::is_configured() const template void Fallback::run() { - const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); + int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); int ldb = 0; const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput); - const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2; + const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2; const size_t a_multi_idx = a_batch_idx + 1; - const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2; + const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 
3 : 2; const size_t d_multi_idx = d_batch_idx + 1; - const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput); + int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput); const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput); - const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput); + int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput); int multi_stride_b = 0; const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput); - const auto in0_ptr = reinterpret_cast(_a->buffer() + _a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast(_a->buffer() + _a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; auto out_ptr = reinterpret_cast(_d->buffer() + _d->info()->offset_first_element_in_bytes()); @@ -415,25 +630,7 @@ void Fallback::run() in1_ptr = reinterpret_cast(_b->buffer() + _b->info()->offset_first_element_in_bytes()); } - IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32) - { - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); - } - else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16 - || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8)) - { - //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - else 
if(_kernel_info.method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (_d->info()->data_type() == DataType::QASYMM8 || _d->info()->data_type() == DataType::QASYMM8_SIGNED)) - { - //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } + const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type()); // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads if(_workspace.buffer() != nullptr) @@ -458,57 +655,67 @@ void Fallback::run() // Prepare assembly kernel prepare(); - TypeOutput *bias = nullptr; // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. + TypeOutput *bias = nullptr; if(_c && _c->info()->data_type() != DataType::S32) { bias = reinterpret_cast(_c->buffer() + _c->info()->offset_first_element_in_bytes()); } + + if(_gemm_info.method == AsmConvMethod::Indirect) + { + in0_ptr = nullptr; + lda = 0; + batch_stride_a = 0; + multi_stride_a = 0; + } + // Set gemm parameters _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d, bias, 0); - // Schedule assembly kernel + // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template void create_arm_gemm(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo 
&ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); // Create arm_gemm fallback auto fallback = support::cpp14::make_unique>(); - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager); + fallback->configure(a, b, c, d, args, info, memory_group, weights_manager); arm_gemm = std::move(fallback); } template void create_arm_gemm_quant(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { ARM_COMPUTE_UNUSED(activation); - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); // Create arm_gemm fallback auto fallback = support::cpp14::make_unique>(); // Configure requantization info - const int32_t a_offset = -a->info()->quantization_info().uniform().offset; - const int32_t b_offset = 
-b->info()->quantization_info().uniform().offset; - const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage(); + const int32_t negation = info.negated_offsets ? 1 : -1; + const int32_t a_offset = -a->info()->quantization_info().uniform().offset * negation; + const int32_t b_offset = -b->info()->quantization_info().uniform().offset * negation; + const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; if(os_info.gemmlowp_shifts.size() > 1) @@ -530,7 +737,7 @@ void create_arm_gemm_quant(std::unique_ptr &a } // Configure fallback - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info); + fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); arm_gemm = std::move(fallback); } @@ -541,14 +748,13 @@ NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr m { } -Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info) +Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { - ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(c, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.pretranpose_B()); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ @@ -579,13 +785,13 @@ bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo & return act.type != arm_gemm::Activation::Type::None; } -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info) +void NEGEMMAssemblyDispatch::configure(const ITensor *a, 
const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info()); + arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info)) + if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info)) { return; } @@ -593,40 +799,40 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const switch(a->info()->data_type()) { case DataType::F32: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: if(d->info()->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: if(d->info()->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } break; 
#endif /* __aarch64__ */ #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) case DataType::BFLOAT16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp new file mode 100644 index 0000000000..642b084fb4 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include +namespace arm_compute +{ +namespace +{ +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act) +{ + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = input->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (output->total_size() == 0) ? 
iqinfo : output->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = input->data_type(); + // Merge activation with output stage + const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + }; + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get(); + int32_t max_activation = type_max.get(); + if(supported_acts.count(act.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); + } + GEMMLowpOutputStageInfo os_info; + os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + os_info.gemmlowp_offset = uoqinfo.offset; + os_info.gemmlowp_min_bound = min_activation; + os_info.gemmlowp_max_bound = max_activation; + os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); + return os_info; +} +AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +{ + AsmGemmInfo asm_info; + asm_info.method = is_indirect ? 
AsmConvMethod::Indirect : AsmConvMethod::Conv; + asm_info.ps_info = info.conv_info; + asm_info.activation_info = info.act_info; + asm_info.depth_output_gemm3d = true; + asm_info.reinterpret_input_as_3d = true; + asm_info.padding_top = info.conv_info.pad_top(); + asm_info.padding_left = info.conv_info.pad_left(); + asm_info.padding_value = 0.f; + asm_info.negated_offsets = false; + return asm_info; +} +} // namespace + +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) + : _gemm_asm_func(memory_manager), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false), _run_activation(false) +{ +} +void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(), + weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), + info)); + _original_weights = weights; + _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 }); + + // Configure assembly dispatch + AsmGemmInfo asm_info = init_assembly_metadata(info, false); + if(is_data_type_quantized(input->info()->data_type())) + { + asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info); + } + _gemm_asm_func.configure(input, &_permuted_weights, biases, output, asm_info); + + // Configure activation + if(info.act_info.enabled() && !_gemm_asm_func.is_activation_supported(info.act_info)) + { + _activation_func.configure(output, nullptr, info.act_info); + _run_activation = true; + } +} +Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); + const DataType data_type = input->data_type(); + const TensorShape i_shape = input->tensor_shape(); + const TensorShape w_shape = weights->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + // Validate biases + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(data_type)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if(data_type == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + AsmGemmInfo asm_info = init_assembly_metadata(info, false); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMAssemblyDispatch::validate(input, weights, biases, output, asm_info)); + return Status{}; +} +void NEGEMMConv2d::run() +{ + prepare(); + + _gemm_asm_func.run(); + if(_run_activation) + { + _activation_func.run(); + } +} +void NEGEMMConv2d::prepare() +{ + if(!_is_prepared) + { + _permuted_weights.allocator()->allocate(); + _weights_permute_func.run(); + _original_weights->mark_as_unused(); + _is_prepared = true; + } +} +} // 
namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp deleted file mode 100644 index 09637dd2d6..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -NEGEMMLowpAssemblyMatrixMultiplyCore::~NEGEMMLowpAssemblyMatrixMultiplyCore() = default; - -NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b() -{ -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - bool run_optimised = false; - switch(a->info()->data_type()) - { - case 
DataType::S8: - case DataType::QASYMM8: - case DataType::U8: - { - _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true)); - run_optimised = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } - if(!run_optimised) - { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - - // Configure matrix multiply kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); - } - - // Allocate tensors - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); - } -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - if(_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel) - { - 
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } -} -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 9050427b34..df8eaacf47 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -47,6 +47,21 @@ namespace arm_compute { +namespace +{ +AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + AsmGemmInfo asm_info; + asm_info.method = AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + + return asm_info; +} +} // namespace + using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; @@ -120,6 +135,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _mm_result_s32.allocator()->init(info_mm_result_s32); } + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ switch(a->info()->data_type()) { @@ -130,12 +147,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); + _asm_glue.configure(a_to_use, b, c, output, asm_info); _fused_assembly_path = _asm_glue.is_configured(); } else { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, gemm_info); + _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info); } _assembly_path = _asm_glue.is_configured(); break; @@ -346,17 +363,20 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso matrix_a_info = &signed_a; } + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(info); + // Check if we need to run the optimized assembly kernel bool run_optimised = false; bool run_optimised_requantized = false; if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info)); + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); } if(run_optimised) diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp deleted file mode 100644 index d165b2235c..0000000000 --- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/NEON/functions/NESimpleAssemblyFunction.h" - -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT - : _kernel() -{ -} - -void NESimpleAssemblyFunction::run() -{ - NEScheduler::get().schedule(_kernel.get(), Window::DimX); -} - -void NESimpleAssemblyFunction::configure(std::unique_ptr kernel) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get()); - _kernel = std::move(kernel); - ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1); -} diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h deleted file mode 100644 index e9be54d35f..0000000000 --- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H -#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H - -#include "arm_compute/runtime/IFunction.h" -#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" - -#include - -namespace arm_compute -{ -/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */ -class NESimpleAssemblyFunction : public IFunction -{ -public: - /** Constructor */ - NESimpleAssemblyFunction(); - - /** Configure the function with the kernel to run - * - * @param[in] kernel GEMM Wrapper kernel configured and ready to run - * - * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension. - */ - void configure(std::unique_ptr kernel); - - // Inherited methods overridden: - void run() override final; - -protected: - std::unique_ptr _kernel; /**< Kernel to run */ -}; -} //namespace arm_compute -#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */ diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 80615c5d57..112188fdfa 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -45,6 +46,20 @@ namespace test { namespace validation { +namespace detail +{ +template <> +void configure_conv_function(NEGEMMConv2d &func, + 
Tensor *src, const Tensor *weights, const Tensor *bias, Tensor *dst, + const PadStrideInfo &info, const WeightsInfo &weights_info, + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +{ + ARM_COMPUTE_UNUSED(weights_info); + + Conv2dInfo conv_info(info, dilation, act_info, false, num_groups); + func.configure(src, weights, bias, dst, conv_info); +} +} // namespace detail namespace { const RelativeTolerance rel_tolerance_f32(0.01f); /**< Relative tolerance for FP32 types */ @@ -368,7 +383,7 @@ TEST_SUITE_END() // WinogradLayer TEST_SUITE(GEMMConvolutionLayer) template -using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture; +using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture; TEST_SUITE(Float) #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) @@ -413,10 +428,10 @@ TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float template -using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; +using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; template -using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture; +using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture; const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { @@ -480,6 +495,82 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized TEST_SUITE_END() // GEMMConvolutionLayer + +TEST_SUITE(DirectGEMMConv2d) +template +using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture; + +TEST_SUITE(Float) +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::F32)), + 
framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float + +template +using NEDirectGEMMConv2dLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; + +template +using NEDirectGEMMConv2dLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture; + +const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) +}); +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + 
validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED + +TEST_SUITE(QSYMM8_PER_CHANNEL) +FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEDirectGEMMConv2dLayerQuantizedPerChannelFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + QuantizationData), + QuantizedActivationFunctionsDataset), + framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QSYMM8_PER_CHANNEL +TEST_SUITE_END() // Quantized + +TEST_SUITE_END() // DirectGEMMConv2d + TEST_SUITE_END() // NEON } // namespace validation } // namespace test diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 9fe7e55de7..04282c2c3c 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/Tensor.h" @@ -53,28 +52,6 @@ const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framew } // namespace TEST_SUITE(NEON) -TEST_SUITE(ASSEMBLY_MATRIX_MULTIPLY) - -using NEGEMMAssemblyFixture_S8 = GEMMLowpAssemblyFixture; -using NEGEMMAssemblyFixture_U8 = GEMMLowpAssemblyFixture; - -TEST_SUITE(S8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_S8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() - -TEST_SUITE(U8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_U8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() -TEST_SUITE_END() - TEST_SUITE(GEMMLowp) TEST_SUITE(MatrixMultiplyCore) using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h index ec13e1d3e0..e1452f5dfc 100644 --- a/tests/validation/fixtures/ConvolutionLayerFixture.h +++ b/tests/validation/fixtures/ConvolutionLayerFixture.h @@ -42,12 +42,22 @@ namespace arm_compute { -class NEConvolutionLayer; - namespace test { namespace validation { +namespace detail +{ +template +void configure_conv_function(ConvolutionFunction &func, + TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst, + const PadStrideInfo &info, const WeightsInfo &weights_info, + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +{ + func.configure(src, weights, bias, dst, info, weights_info, dilation, 
act_info, num_groups); +} +} // namespace detail + template class ConvolutionValidationGenericFixture : public framework::Fixture { @@ -171,7 +181,7 @@ protected: // Create and configure function FunctionType conv; - conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups); + detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); -- cgit v1.2.1